From 1d3795a3ab79613b42a4acb4066782aa2c42c58c Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Sat, 23 Jan 2016 15:32:31 +0100 Subject: [PATCH] Converted _a3 kernels, use SIMD for CPU and GPU --- OpenCL/amp_a1.cl | 6 +- OpenCL/common.c | 5548 +++++++++++++----------------------- OpenCL/m00000_a1.cl | 8 +- OpenCL/m00000_a3.cl | 50 +- OpenCL/m00010_a0.cl | 4 +- OpenCL/m00010_a1.cl | 12 +- OpenCL/m00010_a3.cl | 52 +- OpenCL/m00020_a0.cl | 4 +- OpenCL/m00020_a1.cl | 12 +- OpenCL/m00020_a3.cl | 258 +- OpenCL/m00030_a0.cl | 4 +- OpenCL/m00030_a1.cl | 12 +- OpenCL/m00030_a3.cl | 52 +- OpenCL/m00040_a0.cl | 72 +- OpenCL/m00040_a1.cl | 80 +- OpenCL/m00040_a3.cl | 368 ++- OpenCL/m00050_a1.cl | 8 +- OpenCL/m00050_a3.cl | 112 +- OpenCL/m00060_a1.cl | 8 +- OpenCL/m00060_a3.cl | 120 +- OpenCL/m00100_a1.cl | 8 +- OpenCL/m00100_a3.cl | 436 ++- OpenCL/m00110_a0.cl | 4 +- OpenCL/m00110_a1.cl | 12 +- OpenCL/m00110_a3.cl | 470 ++- OpenCL/m00120_a0.cl | 4 +- OpenCL/m00120_a1.cl | 12 +- OpenCL/m00120_a3.cl | 308 +- OpenCL/m00130_a0.cl | 4 +- OpenCL/m00130_a1.cl | 12 +- OpenCL/m00130_a3.cl | 472 ++- OpenCL/m00140_a0.cl | 4 +- OpenCL/m00140_a1.cl | 12 +- OpenCL/m00140_a3.cl | 308 +- OpenCL/m00150_a1.cl | 8 +- OpenCL/m00150_a3.cl | 112 +- OpenCL/m00160_a1.cl | 8 +- OpenCL/m00160_a3.cl | 112 +- OpenCL/m00190_a1.cl | 8 +- OpenCL/m00190_a3.cl | 453 ++- OpenCL/m00200_a1.cl | 8 +- OpenCL/m00200_a3.cl | 72 +- OpenCL/m00300_a1.cl | 8 +- OpenCL/m00300_a3.cl | 500 ++-- OpenCL/m00900_a1.cl | 8 +- OpenCL/m00900_a3.cl | 118 +- OpenCL/m01000_a1.cl | 8 +- OpenCL/m01000_a3.cl | 124 +- OpenCL/m01100_a1.cl | 8 +- OpenCL/m01100_a3.cl | 84 +- OpenCL/m01400_a1.cl | 8 +- OpenCL/m01400_a3.cl | 148 +- OpenCL/m01410_a0.cl | 4 +- OpenCL/m01410_a1.cl | 12 +- OpenCL/m01410_a3.cl | 183 +- OpenCL/m01420_a0.cl | 4 +- OpenCL/m01420_a1.cl | 12 +- OpenCL/m01420_a3.cl | 385 +-- OpenCL/m01430_a0.cl | 4 +- OpenCL/m01430_a1.cl | 12 +- OpenCL/m01430_a3.cl | 183 +- OpenCL/m01440_a0.cl | 4 +- OpenCL/m01440_a1.cl | 12 +- OpenCL/m01440_a3.cl | 385 +-- OpenCL/m01450_a1.cl | 8 +- OpenCL/m01450_a3.cl | 118 +- OpenCL/m01460_a1.cl | 8 +- OpenCL/m01460_a3.cl | 118 +- OpenCL/m01500_a1.cl | 8 +- OpenCL/m01700_a1.cl | 8 +- OpenCL/m01700_a3.cl | 126 +- OpenCL/m01710_a0.cl | 4 +- OpenCL/m01710_a1.cl | 12 +- OpenCL/m01710_a3.cl | 160 +- OpenCL/m01720_a0.cl | 4 +- OpenCL/m01720_a1.cl | 12 +- OpenCL/m01720_a3.cl | 409 +-- OpenCL/m01730_a0.cl | 4 +- OpenCL/m01730_a1.cl | 12 +- OpenCL/m01730_a3.cl | 161 +- OpenCL/m01740_a0.cl | 4 +- OpenCL/m01740_a1.cl | 12 +- OpenCL/m01740_a3.cl | 409 +-- OpenCL/m01750_a1.cl | 8 +- OpenCL/m01750_a3.cl | 174 +- OpenCL/m01760_a1.cl | 8 +- OpenCL/m01760_a3.cl | 174 +- OpenCL/m02400_a1.cl | 8 +- OpenCL/m02400_a3.cl | 72 +- OpenCL/m02410_a0.cl | 4 +- OpenCL/m02410_a1.cl | 12 +- OpenCL/m02410_a3.cl | 80 +- OpenCL/m02610_a1.cl | 8 +- OpenCL/m02610_a3.cl | 724 ++--- OpenCL/m02710_a1.cl | 8 +- OpenCL/m02710_a3.cl | 742 ++--- OpenCL/m02810_a1.cl | 8 +- OpenCL/m02810_a3.cl | 742 ++--- OpenCL/m03000_a1.cl | 8 +- OpenCL/m03100_a0.cl | 4 +- OpenCL/m03100_a1.cl | 12 +- OpenCL/m03100_a3.cl | 343 +-- OpenCL/m03710_a0.cl | 4 +- OpenCL/m03710_a1.cl | 12 +- OpenCL/m03710_a3.cl | 384 +-- OpenCL/m03800_a0.cl | 8 +- OpenCL/m03800_a1.cl | 16 +- OpenCL/m03800_a3.cl | 92 +- OpenCL/m04310_a1.cl | 8 +- OpenCL/m04310_a3.cl | 724 ++--- OpenCL/m04400_a1.cl | 8 +- OpenCL/m04400_a3.cl | 126 +- OpenCL/m04500_a1.cl | 8 +- OpenCL/m04500_a3.cl | 130 +- OpenCL/m04700_a1.cl | 8 +- OpenCL/m04700_a3.cl | 797 +++--- OpenCL/m04800_a0.cl | 8 +- OpenCL/m04800_a1.cl | 16 +- OpenCL/m04800_a3.cl | 80 +- OpenCL/m04900_a0.cl | 8 +- OpenCL/m04900_a1.cl | 16 +- OpenCL/m04900_a3.cl | 152 +- OpenCL/m05000_a1.cl | 8 +- OpenCL/m05000_a3.cl | 110 +- OpenCL/m05100_a1.cl | 8 +- OpenCL/m05100_a3.cl | 106 +- OpenCL/m05200.cl | 2 +- OpenCL/m05300_a1.cl | 8 +- OpenCL/m05300_a3.cl | 110 +- OpenCL/m05400_a1.cl | 8 +- OpenCL/m05400_a3.cl | 136 +- OpenCL/m05500_a1.cl | 8 +- OpenCL/m05500_a3.cl | 226 +- OpenCL/m05600_a1.cl | 8 +- OpenCL/m05600_a3.cl | 196 +- OpenCL/m06000_a1.cl | 8 +- OpenCL/m06000_a3.cl | 76 +- OpenCL/m06100_a1.cl | 8 +- OpenCL/m06100_a3.cl | 118 +- OpenCL/m06900_a1.cl | 8 +- OpenCL/m06900_a3.cl | 95 +- OpenCL/m07300_a1.cl | 8 +- OpenCL/m07300_a3.cl | 112 +- OpenCL/m07500_a1.cl | 8 +- OpenCL/m07600_a1.cl | 8 +- OpenCL/m07600_a3.cl | 158 +- OpenCL/m07700_a0.cl | 4 +- OpenCL/m07700_a1.cl | 12 +- OpenCL/m07700_a3.cl | 4 +- OpenCL/m07800_a0.cl | 4 +- OpenCL/m07800_a1.cl | 12 +- OpenCL/m07800_a3.cl | 4 +- OpenCL/m08000_a1.cl | 8 +- OpenCL/m08000_a3.cl | 239 +- OpenCL/m08100_a1.cl | 8 +- OpenCL/m08100_a3.cl | 304 +- OpenCL/m08300_a0.cl | 12 +- OpenCL/m08300_a1.cl | 20 +- OpenCL/m08300_a3.cl | 144 +- OpenCL/m08400_a1.cl | 8 +- OpenCL/m08400_a3.cl | 170 +- OpenCL/m08500_a1.cl | 8 +- OpenCL/m08500_a3.cl | 176 +- OpenCL/m08600_a1.cl | 8 +- OpenCL/m08600_a3.cl | 108 +- OpenCL/m08700_a1.cl | 8 +- OpenCL/m08700_a3.cl | 206 +- OpenCL/m09700_a1.cl | 8 +- OpenCL/m09700_a3.cl | 128 +- OpenCL/m09710_a1.cl | 8 +- OpenCL/m09710_a3.cl | 114 +- OpenCL/m09720_a1.cl | 8 +- OpenCL/m09720_a3.cl | 140 +- OpenCL/m09800_a0.cl | 4 +- OpenCL/m09800_a1.cl | 12 +- OpenCL/m09800_a3.cl | 120 +- OpenCL/m09810_a1.cl | 8 +- OpenCL/m09810_a3.cl | 120 +- OpenCL/m09820_a0.cl | 4 +- OpenCL/m09820_a1.cl | 12 +- OpenCL/m09820_a3.cl | 112 +- OpenCL/m09900_a1.cl | 8 +- OpenCL/m09900_a3.cl | 98 +- OpenCL/m10100_a1.cl | 8 +- OpenCL/m10100_a3.cl | 104 +- OpenCL/m10300.cl | 2 +- OpenCL/m10400_a0.cl | 4 +- OpenCL/m10400_a1.cl | 12 +- OpenCL/m10400_a3.cl | 4 +- OpenCL/m10410_a1.cl | 8 +- OpenCL/m10410_a3.cl | 64 +- OpenCL/m10420_a0.cl | 4 +- OpenCL/m10420_a1.cl | 12 +- OpenCL/m10420_a3.cl | 114 +- OpenCL/m10500.cl | 2 +- OpenCL/m10800_a1.cl | 8 +- OpenCL/m10800_a3.cl | 124 +- OpenCL/m11000_a1.cl | 8 +- OpenCL/m11000_a3.cl | 84 +- OpenCL/m11100_a0.cl | 4 +- OpenCL/m11100_a1.cl | 12 +- OpenCL/m11100_a3.cl | 82 +- OpenCL/m11200_a1.cl | 8 +- OpenCL/m11200_a3.cl | 156 +- OpenCL/m11300.cl | 2 +- OpenCL/m11400_a1.cl | 8 +- OpenCL/m11400_a3.cl | 580 ++-- OpenCL/m11500_a1.cl | 8 +- OpenCL/m11500_a3.cl | 84 +- OpenCL/m11700_a1.cl | 8 +- OpenCL/m11700_a3.cl | 90 +- OpenCL/m11800_a1.cl | 8 +- OpenCL/m11800_a3.cl | 90 +- OpenCL/m12600_a1.cl | 8 +- OpenCL/m12600_a3.cl | 138 +- OpenCL/simd.c | 17 + OpenCL/types_ocl.c | 821 ++++-- include/kernel_functions.c | 13 +- include/kernel_vendor.h | 7 + include/shared.h | 6 +- src/oclHashcat.c | 102 +- tools/test.sh | 2 +- 222 files changed, 12369 insertions(+), 13551 deletions(-) diff --git a/OpenCL/amp_a1.cl b/OpenCL/amp_a1.cl index fc34f3872..d901b60bc 100644 --- a/OpenCL/amp_a1.cl +++ b/OpenCL/amp_a1.cl @@ -7,7 +7,7 @@ #include "include/kernel_vendor.h" #include "OpenCL/types_ocl.c" -static void switch_buffer_by_offset (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +static void switch_buffer_by_offset_le (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -789,12 +789,12 @@ __kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __global kernel_r if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, pw_r_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, pw_r_len); } u32 w0[4]; diff --git a/OpenCL/common.c b/OpenCL/common.c index 7c5d45dd5..6e78486f1 100644 --- a/OpenCL/common.c +++ b/OpenCL/common.c @@ -3,6 +3,10 @@ * License.....: MIT */ +/** + * pure scalar functions + */ + static int hash_comp (const u32 d1[4], __global u32 *d2) { if (d1[3] > d2[DGST_R3]) return ( 1); @@ -68,7 +72,11 @@ static void mark_hash (__global plain_t *plains_buf, __global u32 *hashes_shown, plains_buf[hash_pos].il_pos = il_pos; } -static void truncate_block (u32 w[4], const u32 len) +/** + * vector functions + */ + +static void truncate_block (u32x w[4], const u32 len) { switch (len) { @@ -131,31 +139,6 @@ static void truncate_block (u32 w[4], const u32 len) } } -static void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4]) -{ - #ifdef IS_NV - out2[3] = __byte_perm_S (in[3], 0, 0x7372); - out2[2] = __byte_perm_S (in[3], 0, 0x7170); - out2[1] = __byte_perm_S (in[2], 0, 0x7372); - out2[0] = __byte_perm_S (in[2], 0, 0x7170); - out1[3] = __byte_perm_S (in[1], 0, 0x7372); - out1[2] = __byte_perm_S (in[1], 0, 0x7170); - out1[1] = __byte_perm_S (in[0], 0, 0x7372); - out1[0] = __byte_perm_S (in[0], 0, 0x7170); - #endif - - #if defined IS_AMD || defined IS_GENERIC - out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); - out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); - out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); - out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); - out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); - out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); - out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); - out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); - #endif -} - static void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4]) { #ifdef IS_NV @@ -181,27 +164,6 @@ static void make_unicode (const u32x in[4], u32x out1[4], u32x out2[4]) #endif } -static void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4]) -{ - #ifdef IS_NV - out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); - out[1] = __byte_perm_S (in1[2], in1[3], 0x6420); - out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); - out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); - #endif - - #if defined IS_AMD || defined IS_GENERIC - out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) - | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); - out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) - | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); - out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) - | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); - out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) - | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); - #endif -} - static void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4]) { #ifdef IS_NV @@ -223,7 +185,7 @@ static void undo_unicode (const u32x in1[4], const u32x in2[4], u32x out[4]) #endif } -static void append_0x01_1x4 (u32 w0[4], const u32 offset) +static void append_0x01_1x4 (u32x w0[4], const u32 offset) { switch (offset) { @@ -293,7 +255,7 @@ static void append_0x01_1x4 (u32 w0[4], const u32 offset) } } -static void append_0x01_2x4 (u32 w0[4], u32 w1[4], const u32 offset) +static void append_0x01_2x4 (u32x w0[4], u32x w1[4], const u32 offset) { switch (offset) { @@ -427,7 +389,7 @@ static void append_0x01_2x4 (u32 w0[4], u32 w1[4], const u32 offset) } } -static void append_0x01_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +static void append_0x01_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { switch (offset) { @@ -625,7 +587,7 @@ static void append_0x01_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) } } -static void append_0x01_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +static void append_0x01_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { switch (offset) { @@ -887,7 +849,7 @@ static void append_0x01_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u } } -static void append_0x01_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +static void append_0x01_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { switch (offset) { @@ -1405,7 +1367,7 @@ static void append_0x01_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[ } } -static void append_0x02_1x4 (u32 w0[4], const u32 offset) +static void append_0x02_1x4 (u32x w0[4], const u32 offset) { switch (offset) { @@ -1475,7 +1437,7 @@ static void append_0x02_1x4 (u32 w0[4], const u32 offset) } } -static void append_0x02_2x4 (u32 w0[4], u32 w1[4], const u32 offset) +static void append_0x02_2x4 (u32x w0[4], u32x w1[4], const u32 offset) { switch (offset) { @@ -1609,7 +1571,7 @@ static void append_0x02_2x4 (u32 w0[4], u32 w1[4], const u32 offset) } } -static void append_0x02_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +static void append_0x02_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { switch (offset) { @@ -1807,7 +1769,7 @@ static void append_0x02_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) } } -static void append_0x02_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +static void append_0x02_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { switch (offset) { @@ -2069,7 +2031,7 @@ static void append_0x02_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u } } -static void append_0x02_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +static void append_0x02_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { switch (offset) { @@ -2587,7 +2549,7 @@ static void append_0x02_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[ } } -static void append_0x80_1x4 (u32 w0[4], const u32 offset) +static void append_0x80_1x4 (u32x w0[4], const u32 offset) { switch (offset) { @@ -2791,7 +2753,7 @@ static void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) } } -static void append_0x80_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +static void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { switch (offset) { @@ -2989,7 +2951,7 @@ static void append_0x80_3x4 (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) } } -static void append_0x80_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +static void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { switch (offset) { @@ -3251,7 +3213,7 @@ static void append_0x80_4x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u } } -static void append_0x80_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +static void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { switch (offset) { @@ -3769,7 +3731,7 @@ static void append_0x80_8x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[ } } -static void append_0x80_1x16 (u32 w[16], const u32 offset) +static void append_0x80_1x16 (u32x w[16], const u32 offset) { switch (offset) { @@ -4031,1226 +3993,7 @@ static void append_0x80_1x16 (u32 w[16], const u32 offset) } } -static void switch_buffer_by_offset_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset; - - switch (offset / 4) - { - case 0: - w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 1: - w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 2: - w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 3: - w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 4: - w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 5: - w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 6: - w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 7: - w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 8: - w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 9: - w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 10: - w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 11: - w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 12: - w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - - case 13: - w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = 0; - } - - break; - } - #endif - - #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); - - break; - - case 1: - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); - w0[0] = 0; - - break; - - case 2: - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; - - break; - - case 3: - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 4: - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 5: - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 6: - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 7: - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 8: - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 9: - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 10: - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 11: - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 12: - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - - case 13: - w3[1] = __byte_perm_S ( 0, w0[0], selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - break; - } - #endif -} - -static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - switch (offset / 4) - { - case 0: - w3[2] = amd_bytealign_S (w3[1], 0, offset); - w3[1] = amd_bytealign_S (w3[0], w3[1], offset); - w3[0] = amd_bytealign_S (w2[3], w3[0], offset); - w2[3] = amd_bytealign_S (w2[2], w2[3], offset); - w2[2] = amd_bytealign_S (w2[1], w2[2], offset); - w2[1] = amd_bytealign_S (w2[0], w2[1], offset); - w2[0] = amd_bytealign_S (w1[3], w2[0], offset); - w1[3] = amd_bytealign_S (w1[2], w1[3], offset); - w1[2] = amd_bytealign_S (w1[1], w1[2], offset); - w1[1] = amd_bytealign_S (w1[0], w1[1], offset); - w1[0] = amd_bytealign_S (w0[3], w1[0], offset); - w0[3] = amd_bytealign_S (w0[2], w0[3], offset); - w0[2] = amd_bytealign_S (w0[1], w0[2], offset); - w0[1] = amd_bytealign_S (w0[0], w0[1], offset); - w0[0] = amd_bytealign_S ( 0, w0[0], offset); - break; - - case 1: - w3[2] = amd_bytealign_S (w3[0], 0, offset); - w3[1] = amd_bytealign_S (w2[3], w3[0], offset); - w3[0] = amd_bytealign_S (w2[2], w2[3], offset); - w2[3] = amd_bytealign_S (w2[1], w2[2], offset); - w2[2] = amd_bytealign_S (w2[0], w2[1], offset); - w2[1] = amd_bytealign_S (w1[3], w2[0], offset); - w2[0] = amd_bytealign_S (w1[2], w1[3], offset); - w1[3] = amd_bytealign_S (w1[1], w1[2], offset); - w1[2] = amd_bytealign_S (w1[0], w1[1], offset); - w1[1] = amd_bytealign_S (w0[3], w1[0], offset); - w1[0] = amd_bytealign_S (w0[2], w0[3], offset); - w0[3] = amd_bytealign_S (w0[1], w0[2], offset); - w0[2] = amd_bytealign_S (w0[0], w0[1], offset); - w0[1] = amd_bytealign_S ( 0, w0[0], offset); - w0[0] = 0; - break; - - case 2: - w3[2] = amd_bytealign_S (w2[3], 0, offset); - w3[1] = amd_bytealign_S (w2[2], w2[3], offset); - w3[0] = amd_bytealign_S (w2[1], w2[2], offset); - w2[3] = amd_bytealign_S (w2[0], w2[1], offset); - w2[2] = amd_bytealign_S (w1[3], w2[0], offset); - w2[1] = amd_bytealign_S (w1[2], w1[3], offset); - w2[0] = amd_bytealign_S (w1[1], w1[2], offset); - w1[3] = amd_bytealign_S (w1[0], w1[1], offset); - w1[2] = amd_bytealign_S (w0[3], w1[0], offset); - w1[1] = amd_bytealign_S (w0[2], w0[3], offset); - w1[0] = amd_bytealign_S (w0[1], w0[2], offset); - w0[3] = amd_bytealign_S (w0[0], w0[1], offset); - w0[2] = amd_bytealign_S ( 0, w0[0], offset); - w0[1] = 0; - w0[0] = 0; - break; - - case 3: - w3[2] = amd_bytealign_S (w2[2], 0, offset); - w3[1] = amd_bytealign_S (w2[1], w2[2], offset); - w3[0] = amd_bytealign_S (w2[0], w2[1], offset); - w2[3] = amd_bytealign_S (w1[3], w2[0], offset); - w2[2] = amd_bytealign_S (w1[2], w1[3], offset); - w2[1] = amd_bytealign_S (w1[1], w1[2], offset); - w2[0] = amd_bytealign_S (w1[0], w1[1], offset); - w1[3] = amd_bytealign_S (w0[3], w1[0], offset); - w1[2] = amd_bytealign_S (w0[2], w0[3], offset); - w1[1] = amd_bytealign_S (w0[1], w0[2], offset); - w1[0] = amd_bytealign_S (w0[0], w0[1], offset); - w0[3] = amd_bytealign_S ( 0, w0[0], offset); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 4: - w3[2] = amd_bytealign_S (w2[1], 0, offset); - w3[1] = amd_bytealign_S (w2[0], w2[1], offset); - w3[0] = amd_bytealign_S (w1[3], w2[0], offset); - w2[3] = amd_bytealign_S (w1[2], w1[3], offset); - w2[2] = amd_bytealign_S (w1[1], w1[2], offset); - w2[1] = amd_bytealign_S (w1[0], w1[1], offset); - w2[0] = amd_bytealign_S (w0[3], w1[0], offset); - w1[3] = amd_bytealign_S (w0[2], w0[3], offset); - w1[2] = amd_bytealign_S (w0[1], w0[2], offset); - w1[1] = amd_bytealign_S (w0[0], w0[1], offset); - w1[0] = amd_bytealign_S ( 0, w0[0], offset); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 5: - w3[2] = amd_bytealign_S (w2[0], 0, offset); - w3[1] = amd_bytealign_S (w1[3], w2[0], offset); - w3[0] = amd_bytealign_S (w1[2], w1[3], offset); - w2[3] = amd_bytealign_S (w1[1], w1[2], offset); - w2[2] = amd_bytealign_S (w1[0], w1[1], offset); - w2[1] = amd_bytealign_S (w0[3], w1[0], offset); - w2[0] = amd_bytealign_S (w0[2], w0[3], offset); - w1[3] = amd_bytealign_S (w0[1], w0[2], offset); - w1[2] = amd_bytealign_S (w0[0], w0[1], offset); - w1[1] = amd_bytealign_S ( 0, w0[0], offset); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 6: - w3[2] = amd_bytealign_S (w1[3], 0, offset); - w3[1] = amd_bytealign_S (w1[2], w1[3], offset); - w3[0] = amd_bytealign_S (w1[1], w1[2], offset); - w2[3] = amd_bytealign_S (w1[0], w1[1], offset); - w2[2] = amd_bytealign_S (w0[3], w1[0], offset); - w2[1] = amd_bytealign_S (w0[2], w0[3], offset); - w2[0] = amd_bytealign_S (w0[1], w0[2], offset); - w1[3] = amd_bytealign_S (w0[0], w0[1], offset); - w1[2] = amd_bytealign_S ( 0, w0[0], offset); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 7: - w3[2] = amd_bytealign_S (w1[2], 0, offset); - w3[1] = amd_bytealign_S (w1[1], w1[2], offset); - w3[0] = amd_bytealign_S (w1[0], w1[1], offset); - w2[3] = amd_bytealign_S (w0[3], w1[0], offset); - w2[2] = amd_bytealign_S (w0[2], w0[3], offset); - w2[1] = amd_bytealign_S (w0[1], w0[2], offset); - w2[0] = amd_bytealign_S (w0[0], w0[1], offset); - w1[3] = amd_bytealign_S ( 0, w0[0], offset); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 8: - w3[2] = amd_bytealign_S (w1[1], 0, offset); - w3[1] = amd_bytealign_S (w1[0], w1[1], offset); - w3[0] = amd_bytealign_S (w0[3], w1[0], offset); - w2[3] = amd_bytealign_S (w0[2], w0[3], offset); - w2[2] = amd_bytealign_S (w0[1], w0[2], offset); - w2[1] = amd_bytealign_S (w0[0], w0[1], offset); - w2[0] = amd_bytealign_S ( 0, w0[0], offset); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 9: - w3[2] = amd_bytealign_S (w1[0], 0, offset); - w3[1] = amd_bytealign_S (w0[3], w1[0], offset); - w3[0] = amd_bytealign_S (w0[2], w0[3], offset); - w2[3] = amd_bytealign_S (w0[1], w0[2], offset); - w2[2] = amd_bytealign_S (w0[0], w0[1], offset); - w2[1] = amd_bytealign_S ( 0, w0[0], offset); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 10: - w3[2] = amd_bytealign_S (w0[3], 0, offset); - w3[1] = amd_bytealign_S (w0[2], w0[3], offset); - w3[0] = amd_bytealign_S (w0[1], w0[2], offset); - w2[3] = amd_bytealign_S (w0[0], w0[1], offset); - w2[2] = amd_bytealign_S ( 0, w0[0], offset); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 11: - w3[2] = amd_bytealign_S (w0[2], 0, offset); - w3[1] = amd_bytealign_S (w0[1], w0[2], offset); - w3[0] = amd_bytealign_S (w0[0], w0[1], offset); - w2[3] = amd_bytealign_S ( 0, w0[0], offset); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 12: - w3[2] = amd_bytealign_S (w0[1], 0, offset); - w3[1] = amd_bytealign_S (w0[0], w0[1], offset); - w3[0] = amd_bytealign_S ( 0, w0[0], offset); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 13: - w3[2] = amd_bytealign_S (w0[0], 0, offset); - w3[1] = amd_bytealign_S ( 0, w0[0], offset); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - } - #endif - - #ifdef IS_NV - const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - w3[1] = __byte_perm_S (w3[1], w3[0], selector); - w3[0] = __byte_perm_S (w3[0], w2[3], selector); - w2[3] = __byte_perm_S (w2[3], w2[2], selector); - w2[2] = __byte_perm_S (w2[2], w2[1], selector); - w2[1] = __byte_perm_S (w2[1], w2[0], selector); - w2[0] = __byte_perm_S (w2[0], w1[3], selector); - w1[3] = __byte_perm_S (w1[3], w1[2], selector); - w1[2] = __byte_perm_S (w1[2], w1[1], selector); - w1[1] = __byte_perm_S (w1[1], w1[0], selector); - w1[0] = __byte_perm_S (w1[0], w0[3], selector); - w0[3] = __byte_perm_S (w0[3], w0[2], selector); - w0[2] = __byte_perm_S (w0[2], w0[1], selector); - w0[1] = __byte_perm_S (w0[1], w0[0], selector); - w0[0] = __byte_perm_S (w0[0], 0, selector); - break; - - case 1: - w3[1] = __byte_perm_S (w3[0], w2[3], selector); - w3[0] = __byte_perm_S (w2[3], w2[2], selector); - w2[3] = __byte_perm_S (w2[2], w2[1], selector); - w2[2] = __byte_perm_S (w2[1], w2[0], selector); - w2[1] = __byte_perm_S (w2[0], w1[3], selector); - w2[0] = __byte_perm_S (w1[3], w1[2], selector); - w1[3] = __byte_perm_S (w1[2], w1[1], selector); - w1[2] = __byte_perm_S (w1[1], w1[0], selector); - w1[1] = __byte_perm_S (w1[0], w0[3], selector); - w1[0] = __byte_perm_S (w0[3], w0[2], selector); - w0[3] = __byte_perm_S (w0[2], w0[1], selector); - w0[2] = __byte_perm_S (w0[1], w0[0], selector); - w0[1] = __byte_perm_S (w0[0], 0, selector); - w0[0] = 0; - break; - - case 2: - w3[1] = __byte_perm_S (w2[3], w2[2], selector); - w3[0] = __byte_perm_S (w2[2], w2[1], selector); - w2[3] = __byte_perm_S (w2[1], w2[0], selector); - w2[2] = __byte_perm_S (w2[0], w1[3], selector); - w2[1] = __byte_perm_S (w1[3], w1[2], selector); - w2[0] = __byte_perm_S (w1[2], w1[1], selector); - w1[3] = __byte_perm_S (w1[1], w1[0], selector); - w1[2] = __byte_perm_S (w1[0], w0[3], selector); - w1[1] = __byte_perm_S (w0[3], w0[2], selector); - w1[0] = __byte_perm_S (w0[2], w0[1], selector); - w0[3] = __byte_perm_S (w0[1], w0[0], selector); - w0[2] = __byte_perm_S (w0[0], 0, selector); - w0[1] = 0; - w0[0] = 0; - break; - - case 3: - w3[1] = __byte_perm_S (w2[2], w2[1], selector); - w3[0] = __byte_perm_S (w2[1], w2[0], selector); - w2[3] = __byte_perm_S (w2[0], w1[3], selector); - w2[2] = __byte_perm_S (w1[3], w1[2], selector); - w2[1] = __byte_perm_S (w1[2], w1[1], selector); - w2[0] = __byte_perm_S (w1[1], w1[0], selector); - w1[3] = __byte_perm_S (w1[0], w0[3], selector); - w1[2] = __byte_perm_S (w0[3], w0[2], selector); - w1[1] = __byte_perm_S (w0[2], w0[1], selector); - w1[0] = __byte_perm_S (w0[1], w0[0], selector); - w0[3] = __byte_perm_S (w0[0], 0, selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 4: - w3[1] = __byte_perm_S (w2[1], w2[0], selector); - w3[0] = __byte_perm_S (w2[0], w1[3], selector); - w2[3] = __byte_perm_S (w1[3], w1[2], selector); - w2[2] = __byte_perm_S (w1[2], w1[1], selector); - w2[1] = __byte_perm_S (w1[1], w1[0], selector); - w2[0] = __byte_perm_S (w1[0], w0[3], selector); - w1[3] = __byte_perm_S (w0[3], w0[2], selector); - w1[2] = __byte_perm_S (w0[2], w0[1], selector); - w1[1] = __byte_perm_S (w0[1], w0[0], selector); - w1[0] = __byte_perm_S (w0[0], 0, selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 5: - w3[1] = __byte_perm_S (w2[0], w1[3], selector); - w3[0] = __byte_perm_S (w1[3], w1[2], selector); - w2[3] = __byte_perm_S (w1[2], w1[1], selector); - w2[2] = __byte_perm_S (w1[1], w1[0], selector); - w2[1] = __byte_perm_S (w1[0], w0[3], selector); - w2[0] = __byte_perm_S (w0[3], w0[2], selector); - w1[3] = __byte_perm_S (w0[2], w0[1], selector); - w1[2] = __byte_perm_S (w0[1], w0[0], selector); - w1[1] = __byte_perm_S (w0[0], 0, selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 6: - w3[1] = __byte_perm_S (w1[3], w1[2], selector); - w3[0] = __byte_perm_S (w1[2], w1[1], selector); - w2[3] = __byte_perm_S (w1[1], w1[0], selector); - w2[2] = __byte_perm_S (w1[0], w0[3], selector); - w2[1] = __byte_perm_S (w0[3], w0[2], selector); - w2[0] = __byte_perm_S (w0[2], w0[1], selector); - w1[3] = __byte_perm_S (w0[1], w0[0], selector); - w1[2] = __byte_perm_S (w0[0], 0, selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 7: - w3[1] = __byte_perm_S (w1[2], w1[1], selector); - w3[0] = __byte_perm_S (w1[1], w1[0], selector); - w2[3] = __byte_perm_S (w1[0], w0[3], selector); - w2[2] = __byte_perm_S (w0[3], w0[2], selector); - w2[1] = __byte_perm_S (w0[2], w0[1], selector); - w2[0] = __byte_perm_S (w0[1], w0[0], selector); - w1[3] = __byte_perm_S (w0[0], 0, selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 8: - w3[1] = __byte_perm_S (w1[1], w1[0], selector); - w3[0] = __byte_perm_S (w1[0], w0[3], selector); - w2[3] = __byte_perm_S (w0[3], w0[2], selector); - w2[2] = __byte_perm_S (w0[2], w0[1], selector); - w2[1] = __byte_perm_S (w0[1], w0[0], selector); - w2[0] = __byte_perm_S (w0[0], 0, selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 9: - w3[1] = __byte_perm_S (w1[0], w0[3], selector); - w3[0] = __byte_perm_S (w0[3], w0[2], selector); - w2[3] = __byte_perm_S (w0[2], w0[1], selector); - w2[2] = __byte_perm_S (w0[1], w0[0], selector); - w2[1] = __byte_perm_S (w0[0], 0, selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 10: - w3[1] = __byte_perm_S (w0[3], w0[2], selector); - w3[0] = __byte_perm_S (w0[2], w0[1], selector); - w2[3] = __byte_perm_S (w0[1], w0[0], selector); - w2[2] = __byte_perm_S (w0[0], 0, selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 11: - w3[1] = __byte_perm_S (w0[2], w0[1], selector); - w3[0] = __byte_perm_S (w0[1], w0[0], selector); - w2[3] = __byte_perm_S (w0[0], 0, selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 12: - w3[1] = __byte_perm_S (w0[1], w0[0], selector); - w3[0] = __byte_perm_S (w0[0], 0, selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 13: - w3[1] = __byte_perm_S (w0[0], 0, selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - } - #endif -} - -static void switch_buffer_by_offset (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) +static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -6469,2765 +5212,2284 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif } -/* not needed anymore? - -// before: append_0x80_2_be -static void append_0x80_2x4_be (u32 w0[4], u32 w1[4], const u32 offset) +static void overwrite_at_le (u32x sw[16], const u32x w0, const u32 salt_len) { - switch (offset) + #if defined cl_amd_media_ops + switch (salt_len) { - case 0: - w0[0] |= 0x80000000; - break; + case 0: sw[0] = w0; + break; + case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3); + sw[1] = amd_bytealign (sw[1] >> 8, w0, 3); + break; + case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2); + sw[1] = amd_bytealign (sw[1] >> 16, w0, 2); + break; + case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1); + sw[1] = amd_bytealign (sw[1] >> 24, w0, 1); + break; + case 4: sw[1] = w0; + break; + case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3); + sw[2] = amd_bytealign (sw[2] >> 8, w0, 3); + break; + case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2); + sw[2] = amd_bytealign (sw[2] >> 16, w0, 2); + break; + case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1); + sw[2] = amd_bytealign (sw[2] >> 24, w0, 1); + break; + case 8: sw[2] = w0; + break; + case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3); + sw[3] = amd_bytealign (sw[3] >> 8, w0, 3); + break; + case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2); + sw[3] = amd_bytealign (sw[3] >> 16, w0, 2); + break; + case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1); + sw[3] = amd_bytealign (sw[3] >> 24, w0, 1); + break; + case 12: sw[3] = w0; + break; + case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3); + sw[4] = amd_bytealign (sw[4] >> 8, w0, 3); + break; + case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2); + sw[4] = amd_bytealign (sw[4] >> 16, w0, 2); + break; + case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1); + sw[4] = amd_bytealign (sw[4] >> 24, w0, 1); + break; + case 16: sw[4] = w0; + break; + case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3); + sw[5] = amd_bytealign (sw[5] >> 8, w0, 3); + break; + case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2); + sw[5] = amd_bytealign (sw[5] >> 16, w0, 2); + break; + case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1); + sw[5] = amd_bytealign (sw[5] >> 24, w0, 1); + break; + case 20: sw[5] = w0; + break; + case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3); + sw[6] = amd_bytealign (sw[6] >> 8, w0, 3); + break; + case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2); + sw[6] = amd_bytealign (sw[6] >> 16, w0, 2); + break; + case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1); + sw[6] = amd_bytealign (sw[6] >> 24, w0, 1); + break; + case 24: sw[6] = w0; + break; + case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3); + sw[7] = amd_bytealign (sw[7] >> 8, w0, 3); + break; + case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2); + sw[7] = amd_bytealign (sw[7] >> 16, w0, 2); + break; + case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1); + sw[7] = amd_bytealign (sw[7] >> 24, w0, 1); + break; + case 28: sw[7] = w0; + break; + case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3); + sw[8] = amd_bytealign (sw[8] >> 8, w0, 3); + break; + case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2); + sw[8] = amd_bytealign (sw[8] >> 16, w0, 2); + break; + case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1); + sw[8] = amd_bytealign (sw[8] >> 24, w0, 1); + break; + } + #else + switch (salt_len) + { + case 0: sw[0] = w0; + break; + case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8); + sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); + break; + case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16); + sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); + break; + case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24); + sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); + break; + case 4: sw[1] = w0; + break; + case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); + sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); + break; + case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); + sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); + break; + case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); + sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); + break; + case 8: sw[2] = w0; + break; + case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); + sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); + break; + case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); + sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); + break; + case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); + sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); + break; + case 12: sw[3] = w0; + break; + case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); + sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); + break; + case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); + sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); + break; + case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); + sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); + break; + case 16: sw[4] = w0; + break; + case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); + sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); + break; + case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); + sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); + break; + case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); + sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); + break; + case 20: sw[5] = w0; + break; + case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); + sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); + break; + case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); + sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); + break; + case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); + sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); + break; + case 24: sw[6] = w0; + break; + case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); + sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); + break; + case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); + sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); + break; + case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); + sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); + break; + case 28: sw[7] = w0; + break; + case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); + sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24); + break; + case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); + sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16); + break; + case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); + sw[8] = (sw[8] & 0xff000000) | (w0 >> 8); + break; + } + #endif +} - case 1: - w0[0] |= 0x800000; - break; +static void overwrite_at_be (u32x sw[16], const u32x w0, const u32 salt_len) +{ + // would be nice to have optimization based on amd_bytealign as with _le counterpart - case 2: - w0[0] |= 0x8000; - break; - - case 3: - w0[0] |= 0x80; - break; - - case 4: - w0[1] |= 0x80000000; - break; - - case 5: - w0[1] |= 0x800000; - break; - - case 6: - w0[1] |= 0x8000; - break; - - case 7: - w0[1] |= 0x80; - break; - - case 8: - w0[2] |= 0x80000000; - break; - - case 9: - w0[2] |= 0x800000; - break; - - case 10: - w0[2] |= 0x8000; - break; - - case 11: - w0[2] |= 0x80; - break; - - case 12: - w0[3] |= 0x80000000; - break; - - case 13: - w0[3] |= 0x800000; - break; - - case 14: - w0[3] |= 0x8000; - break; - - case 15: - w0[3] |= 0x80; - break; - - case 16: - w1[0] |= 0x80000000; - break; - - case 17: - w1[0] |= 0x800000; - break; - - case 18: - w1[0] |= 0x8000; - break; - - case 19: - w1[0] |= 0x80; - break; - - case 20: - w1[1] |= 0x80000000; - break; - - case 21: - w1[1] |= 0x800000; - break; - - case 22: - w1[1] |= 0x8000; - break; - - case 23: - w1[1] |= 0x80; - break; - - case 24: - w1[2] |= 0x80000000; - break; - - case 25: - w1[2] |= 0x800000; - break; - - case 26: - w1[2] |= 0x8000; - break; - - case 27: - w1[2] |= 0x80; - break; - - case 28: - w1[3] |= 0x80000000; - break; - - case 29: - w1[3] |= 0x800000; - break; - - case 30: - w1[3] |= 0x8000; - break; - - case 31: - w1[3] |= 0x80; - break; + switch (salt_len) + { + case 0: sw[0] = w0; + break; + case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8); + sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); + break; + case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16); + sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); + break; + case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24); + sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); + break; + case 4: sw[1] = w0; + break; + case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); + sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); + break; + case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); + sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); + break; + case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); + sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); + break; + case 8: sw[2] = w0; + break; + case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); + sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); + break; + case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); + sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); + break; + case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); + sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); + break; + case 12: sw[3] = w0; + break; + case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); + sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); + break; + case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); + sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); + break; + case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); + sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); + break; + case 16: sw[4] = w0; + break; + case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); + sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); + break; + case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); + sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); + break; + case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); + sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); + break; + case 20: sw[5] = w0; + break; + case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); + sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); + break; + case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); + sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); + break; + case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); + sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); + break; + case 24: sw[6] = w0; + break; + case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); + sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); + break; + case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); + sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); + break; + case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); + sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); + break; + case 28: sw[7] = w0; + break; + case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); + sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24); + break; + case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); + sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16); + break; + case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); + sw[8] = (sw[8] & 0x000000ff) | (w0 << 8); + break; } } -// before: append_0x80_8 -static void append_0x80_1x32 (u32 w[32], const u32 offset) +/** + * vector functions as scalar (for outer loop usage) + */ + +static void append_0x80_1x4_S (u32 w0[4], const u32 offset) { switch (offset) { case 0: - w[ 0] = 0x80; + w0[0] = 0x80; break; case 1: - w[ 0] = w[ 0] | 0x8000; + w0[0] = w0[0] | 0x8000; break; case 2: - w[ 0] = w[ 0] | 0x800000; + w0[0] = w0[0] | 0x800000; break; case 3: - w[ 0] = w[ 0] | 0x80000000; + w0[0] = w0[0] | 0x80000000; break; case 4: - w[ 1] = 0x80; + w0[1] = 0x80; break; case 5: - w[ 1] = w[ 1] | 0x8000; + w0[1] = w0[1] | 0x8000; break; case 6: - w[ 1] = w[ 1] | 0x800000; + w0[1] = w0[1] | 0x800000; break; case 7: - w[ 1] = w[ 1] | 0x80000000; + w0[1] = w0[1] | 0x80000000; break; case 8: - w[ 2] = 0x80; + w0[2] = 0x80; break; case 9: - w[ 2] = w[ 2] | 0x8000; + w0[2] = w0[2] | 0x8000; break; case 10: - w[ 2] = w[ 2] | 0x800000; + w0[2] = w0[2] | 0x800000; break; case 11: - w[ 2] = w[ 2] | 0x80000000; + w0[2] = w0[2] | 0x80000000; break; case 12: - w[ 3] = 0x80; + w0[3] = 0x80; break; case 13: - w[ 3] = w[ 3] | 0x8000; + w0[3] = w0[3] | 0x8000; break; case 14: - w[ 3] = w[ 3] | 0x800000; + w0[3] = w0[3] | 0x800000; break; case 15: - w[ 3] = w[ 3] | 0x80000000; + w0[3] = w0[3] | 0x80000000; + break; + } +} + +static void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = 0x80; + break; + + case 1: + w0[0] = w0[0] | 0x8000; + break; + + case 2: + w0[0] = w0[0] | 0x800000; + break; + + case 3: + w0[0] = w0[0] | 0x80000000; + break; + + case 4: + w0[1] = 0x80; + break; + + case 5: + w0[1] = w0[1] | 0x8000; + break; + + case 6: + w0[1] = w0[1] | 0x800000; + break; + + case 7: + w0[1] = w0[1] | 0x80000000; + break; + + case 8: + w0[2] = 0x80; + break; + + case 9: + w0[2] = w0[2] | 0x8000; + break; + + case 10: + w0[2] = w0[2] | 0x800000; + break; + + case 11: + w0[2] = w0[2] | 0x80000000; + break; + + case 12: + w0[3] = 0x80; + break; + + case 13: + w0[3] = w0[3] | 0x8000; + break; + + case 14: + w0[3] = w0[3] | 0x800000; + break; + + case 15: + w0[3] = w0[3] | 0x80000000; break; case 16: - w[ 4] = 0x80; + w1[0] = 0x80; break; case 17: - w[ 4] = w[ 4] | 0x8000; + w1[0] = w1[0] | 0x8000; break; case 18: - w[ 4] = w[ 4] | 0x800000; + w1[0] = w1[0] | 0x800000; break; case 19: - w[ 4] = w[ 4] | 0x80000000; + w1[0] = w1[0] | 0x80000000; break; case 20: - w[ 5] = 0x80; + w1[1] = 0x80; break; case 21: - w[ 5] = w[ 5] | 0x8000; + w1[1] = w1[1] | 0x8000; break; case 22: - w[ 5] = w[ 5] | 0x800000; + w1[1] = w1[1] | 0x800000; break; case 23: - w[ 5] = w[ 5] | 0x80000000; + w1[1] = w1[1] | 0x80000000; break; case 24: - w[ 6] = 0x80; + w1[2] = 0x80; break; case 25: - w[ 6] = w[ 6] | 0x8000; + w1[2] = w1[2] | 0x8000; break; case 26: - w[ 6] = w[ 6] | 0x800000; + w1[2] = w1[2] | 0x800000; break; case 27: - w[ 6] = w[ 6] | 0x80000000; + w1[2] = w1[2] | 0x80000000; break; case 28: - w[ 7] = 0x80; + w1[3] = 0x80; break; case 29: - w[ 7] = w[ 7] | 0x8000; + w1[3] = w1[3] | 0x8000; break; case 30: - w[ 7] = w[ 7] | 0x800000; + w1[3] = w1[3] | 0x800000; break; case 31: - w[ 7] = w[ 7] | 0x80000000; + w1[3] = w1[3] | 0x80000000; + break; + } +} + +static void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = 0x80; + break; + + case 1: + w0[0] = w0[0] | 0x8000; + break; + + case 2: + w0[0] = w0[0] | 0x800000; + break; + + case 3: + w0[0] = w0[0] | 0x80000000; + break; + + case 4: + w0[1] = 0x80; + break; + + case 5: + w0[1] = w0[1] | 0x8000; + break; + + case 6: + w0[1] = w0[1] | 0x800000; + break; + + case 7: + w0[1] = w0[1] | 0x80000000; + break; + + case 8: + w0[2] = 0x80; + break; + + case 9: + w0[2] = w0[2] | 0x8000; + break; + + case 10: + w0[2] = w0[2] | 0x800000; + break; + + case 11: + w0[2] = w0[2] | 0x80000000; + break; + + case 12: + w0[3] = 0x80; + break; + + case 13: + w0[3] = w0[3] | 0x8000; + break; + + case 14: + w0[3] = w0[3] | 0x800000; + break; + + case 15: + w0[3] = w0[3] | 0x80000000; + break; + + case 16: + w1[0] = 0x80; + break; + + case 17: + w1[0] = w1[0] | 0x8000; + break; + + case 18: + w1[0] = w1[0] | 0x800000; + break; + + case 19: + w1[0] = w1[0] | 0x80000000; + break; + + case 20: + w1[1] = 0x80; + break; + + case 21: + w1[1] = w1[1] | 0x8000; + break; + + case 22: + w1[1] = w1[1] | 0x800000; + break; + + case 23: + w1[1] = w1[1] | 0x80000000; + break; + + case 24: + w1[2] = 0x80; + break; + + case 25: + w1[2] = w1[2] | 0x8000; + break; + + case 26: + w1[2] = w1[2] | 0x800000; + break; + + case 27: + w1[2] = w1[2] | 0x80000000; + break; + + case 28: + w1[3] = 0x80; + break; + + case 29: + w1[3] = w1[3] | 0x8000; + break; + + case 30: + w1[3] = w1[3] | 0x800000; + break; + + case 31: + w1[3] = w1[3] | 0x80000000; break; case 32: - w[ 8] = 0x80; + w2[0] = 0x80; break; case 33: - w[ 8] = w[ 8] | 0x8000; + w2[0] = w2[0] | 0x8000; break; case 34: - w[ 8] = w[ 8] | 0x800000; + w2[0] = w2[0] | 0x800000; break; case 35: - w[ 8] = w[ 8] | 0x80000000; + w2[0] = w2[0] | 0x80000000; break; case 36: - w[ 9] = 0x80; + w2[1] = 0x80; break; case 37: - w[ 9] = w[ 9] | 0x8000; + w2[1] = w2[1] | 0x8000; break; case 38: - w[ 9] = w[ 9] | 0x800000; + w2[1] = w2[1] | 0x800000; break; case 39: - w[ 9] = w[ 9] | 0x80000000; + w2[1] = w2[1] | 0x80000000; break; case 40: - w[10] = 0x80; + w2[2] = 0x80; break; case 41: - w[10] = w[10] | 0x8000; + w2[2] = w2[2] | 0x8000; break; case 42: - w[10] = w[10] | 0x800000; + w2[2] = w2[2] | 0x800000; break; case 43: - w[10] = w[10] | 0x80000000; + w2[2] = w2[2] | 0x80000000; break; case 44: - w[11] = 0x80; + w2[3] = 0x80; break; case 45: - w[11] = w[11] | 0x8000; + w2[3] = w2[3] | 0x8000; break; case 46: - w[11] = w[11] | 0x800000; + w2[3] = w2[3] | 0x800000; break; case 47: - w[11] = w[11] | 0x80000000; + w2[3] = w2[3] | 0x80000000; + break; + } +} + +static void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +{ + switch (offset) + { + case 0: + w0[0] = 0x80; + break; + + case 1: + w0[0] = w0[0] | 0x8000; + break; + + case 2: + w0[0] = w0[0] | 0x800000; + break; + + case 3: + w0[0] = w0[0] | 0x80000000; + break; + + case 4: + w0[1] = 0x80; + break; + + case 5: + w0[1] = w0[1] | 0x8000; + break; + + case 6: + w0[1] = w0[1] | 0x800000; + break; + + case 7: + w0[1] = w0[1] | 0x80000000; + break; + + case 8: + w0[2] = 0x80; + break; + + case 9: + w0[2] = w0[2] | 0x8000; + break; + + case 10: + w0[2] = w0[2] | 0x800000; + break; + + case 11: + w0[2] = w0[2] | 0x80000000; + break; + + case 12: + w0[3] = 0x80; + break; + + case 13: + w0[3] = w0[3] | 0x8000; + break; + + case 14: + w0[3] = w0[3] | 0x800000; + break; + + case 15: + w0[3] = w0[3] | 0x80000000; + break; + + case 16: + w1[0] = 0x80; + break; + + case 17: + w1[0] = w1[0] | 0x8000; + break; + + case 18: + w1[0] = w1[0] | 0x800000; + break; + + case 19: + w1[0] = w1[0] | 0x80000000; + break; + + case 20: + w1[1] = 0x80; + break; + + case 21: + w1[1] = w1[1] | 0x8000; + break; + + case 22: + w1[1] = w1[1] | 0x800000; + break; + + case 23: + w1[1] = w1[1] | 0x80000000; + break; + + case 24: + w1[2] = 0x80; + break; + + case 25: + w1[2] = w1[2] | 0x8000; + break; + + case 26: + w1[2] = w1[2] | 0x800000; + break; + + case 27: + w1[2] = w1[2] | 0x80000000; + break; + + case 28: + w1[3] = 0x80; + break; + + case 29: + w1[3] = w1[3] | 0x8000; + break; + + case 30: + w1[3] = w1[3] | 0x800000; + break; + + case 31: + w1[3] = w1[3] | 0x80000000; + break; + + case 32: + w2[0] = 0x80; + break; + + case 33: + w2[0] = w2[0] | 0x8000; + break; + + case 34: + w2[0] = w2[0] | 0x800000; + break; + + case 35: + w2[0] = w2[0] | 0x80000000; + break; + + case 36: + w2[1] = 0x80; + break; + + case 37: + w2[1] = w2[1] | 0x8000; + break; + + case 38: + w2[1] = w2[1] | 0x800000; + break; + + case 39: + w2[1] = w2[1] | 0x80000000; + break; + + case 40: + w2[2] = 0x80; + break; + + case 41: + w2[2] = w2[2] | 0x8000; + break; + + case 42: + w2[2] = w2[2] | 0x800000; + break; + + case 43: + w2[2] = w2[2] | 0x80000000; + break; + + case 44: + w2[3] = 0x80; + break; + + case 45: + w2[3] = w2[3] | 0x8000; + break; + + case 46: + w2[3] = w2[3] | 0x800000; + break; + + case 47: + w2[3] = w2[3] | 0x80000000; break; case 48: - w[12] = 0x80; + w3[0] = 0x80; break; case 49: - w[12] = w[12] | 0x8000; + w3[0] = w3[0] | 0x8000; break; case 50: - w[12] = w[12] | 0x800000; + w3[0] = w3[0] | 0x800000; break; case 51: - w[12] = w[12] | 0x80000000; + w3[0] = w3[0] | 0x80000000; break; case 52: - w[13] = 0x80; + w3[1] = 0x80; break; case 53: - w[13] = w[13] | 0x8000; + w3[1] = w3[1] | 0x8000; break; case 54: - w[13] = w[13] | 0x800000; + w3[1] = w3[1] | 0x800000; break; case 55: - w[13] = w[13] | 0x80000000; + w3[1] = w3[1] | 0x80000000; break; case 56: - w[14] = 0x80; + w3[2] = 0x80; break; case 57: - w[14] = w[14] | 0x8000; + w3[2] = w3[2] | 0x8000; break; case 58: - w[14] = w[14] | 0x800000; + w3[2] = w3[2] | 0x800000; break; case 59: - w[14] = w[14] | 0x80000000; + w3[2] = w3[2] | 0x80000000; break; case 60: - w[15] = 0x80; + w3[3] = 0x80; break; case 61: - w[15] = w[15] | 0x8000; + w3[3] = w3[3] | 0x8000; break; case 62: - w[15] = w[15] | 0x800000; + w3[3] = w3[3] | 0x800000; break; case 63: - w[15] = w[15] | 0x80000000; - break; - - case 64: - w[16] = 0x80; - break; - - case 65: - w[16] = w[16] | 0x8000; - break; - - case 66: - w[16] = w[16] | 0x800000; - break; - - case 67: - w[16] = w[16] | 0x80000000; - break; - - case 68: - w[17] = 0x80; - break; - - case 69: - w[17] = w[17] | 0x8000; - break; - - case 70: - w[17] = w[17] | 0x800000; - break; - - case 71: - w[17] = w[17] | 0x80000000; - break; - - case 72: - w[18] = 0x80; - break; - - case 73: - w[18] = w[18] | 0x8000; - break; - - case 74: - w[18] = w[18] | 0x800000; - break; - - case 75: - w[18] = w[18] | 0x80000000; - break; - - case 76: - w[19] = 0x80; - break; - - case 77: - w[19] = w[19] | 0x8000; - break; - - case 78: - w[19] = w[19] | 0x800000; - break; - - case 79: - w[19] = w[19] | 0x80000000; - break; - - case 80: - w[20] = 0x80; - break; - - case 81: - w[20] = w[20] | 0x8000; - break; - - case 82: - w[20] = w[20] | 0x800000; - break; - - case 83: - w[20] = w[20] | 0x80000000; - break; - - case 84: - w[21] = 0x80; - break; - - case 85: - w[21] = w[21] | 0x8000; - break; - - case 86: - w[21] = w[21] | 0x800000; - break; - - case 87: - w[21] = w[21] | 0x80000000; - break; - - case 88: - w[22] = 0x80; - break; - - case 89: - w[22] = w[22] | 0x8000; - break; - - case 90: - w[22] = w[22] | 0x800000; - break; - - case 91: - w[22] = w[22] | 0x80000000; - break; - - case 92: - w[23] = 0x80; - break; - - case 93: - w[23] = w[23] | 0x8000; - break; - - case 94: - w[23] = w[23] | 0x800000; - break; - - case 95: - w[23] = w[23] | 0x80000000; - break; - - case 96: - w[24] = 0x80; - break; - - case 97: - w[24] = w[24] | 0x8000; - break; - - case 98: - w[24] = w[24] | 0x800000; - break; - - case 99: - w[24] = w[24] | 0x80000000; - break; - - case 100: - w[25] = 0x80; - break; - - case 101: - w[25] = w[25] | 0x8000; - break; - - case 102: - w[25] = w[25] | 0x800000; - break; - - case 103: - w[25] = w[25] | 0x80000000; - break; - - case 104: - w[26] = 0x80; - break; - - case 105: - w[26] = w[26] | 0x8000; - break; - - case 106: - w[26] = w[26] | 0x800000; - break; - - case 107: - w[26] = w[26] | 0x80000000; - break; - - case 108: - w[27] = 0x80; - break; - - case 109: - w[27] = w[27] | 0x8000; - break; - - case 110: - w[27] = w[27] | 0x800000; - break; - - case 111: - w[27] = w[27] | 0x80000000; - break; - - case 112: - w[28] = 0x80; - break; - - case 113: - w[28] = w[28] | 0x8000; - break; - - case 114: - w[28] = w[28] | 0x800000; - break; - - case 115: - w[28] = w[28] | 0x80000000; - break; - - case 116: - w[29] = 0x80; - break; - - case 117: - w[29] = w[29] | 0x8000; - break; - - case 118: - w[29] = w[29] | 0x800000; - break; - - case 119: - w[29] = w[29] | 0x80000000; - break; - - case 120: - w[30] = 0x80; - break; - - case 121: - w[30] = w[30] | 0x8000; - break; - - case 122: - w[30] = w[30] | 0x800000; - break; - - case 123: - w[30] = w[30] | 0x80000000; - break; - - case 124: - w[31] = 0x80; - break; - - case 125: - w[31] = w[31] | 0x8000; - break; - - case 126: - w[31] = w[31] | 0x800000; - break; - - case 127: - w[31] = w[31] | 0x80000000; + w3[3] = w3[3] | 0x80000000; break; } } -// before: device_memcat2L -static void memcat_c7_d1x2_sl1x2_sr1x2 (const u32 offset, u32 dst0[2], u32 src_l0[2], u32 src_r0[2]) +static void truncate_block_S (u32 w[4], const u32 len) { - switch (offset) + switch (len) { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - break; - - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - break; - - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - break; - - case 4: - dst0[1] = src_r0[0]; - break; - - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - break; - - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - break; - - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - break; + case 0: w[0] &= 0; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; + break; + case 1: w[0] &= 0x000000FF; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; + break; + case 2: w[0] &= 0x0000FFFF; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; + break; + case 3: w[0] &= 0x00FFFFFF; + w[1] &= 0; + w[2] &= 0; + w[3] &= 0; + break; + case 4: w[1] &= 0; + w[2] &= 0; + w[3] &= 0; + break; + case 5: w[1] &= 0x000000FF; + w[2] &= 0; + w[3] &= 0; + break; + case 6: w[1] &= 0x0000FFFF; + w[2] &= 0; + w[3] &= 0; + break; + case 7: w[1] &= 0x00FFFFFF; + w[2] &= 0; + w[3] &= 0; + break; + case 8: w[2] &= 0; + w[3] &= 0; + break; + case 9: w[2] &= 0x000000FF; + w[3] &= 0; + break; + case 10: w[2] &= 0x0000FFFF; + w[3] &= 0; + break; + case 11: w[2] &= 0x00FFFFFF; + w[3] &= 0; + break; + case 12: w[3] &= 0; + break; + case 13: w[3] &= 0x000000FF; + break; + case 14: w[3] &= 0x0000FFFF; + break; + case 15: w[3] &= 0x00FFFFFF; + break; } } -// before: device_memcat4L -static void memcat_c15_d1x4_sl1x4_sr1x4 (const u32 offset, u32 dst0[4], u32 src_l0[4], u32 src_r0[4]) +static void make_unicode_S (const u32 in[4], u32 out1[4], u32 out2[4]) { - switch (offset) - { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - break; + #ifdef IS_NV + out2[3] = __byte_perm_S (in[3], 0, 0x7372); + out2[2] = __byte_perm_S (in[3], 0, 0x7170); + out2[1] = __byte_perm_S (in[2], 0, 0x7372); + out2[0] = __byte_perm_S (in[2], 0, 0x7170); + out1[3] = __byte_perm_S (in[1], 0, 0x7372); + out1[2] = __byte_perm_S (in[1], 0, 0x7170); + out1[1] = __byte_perm_S (in[0], 0, 0x7372); + out1[0] = __byte_perm_S (in[0], 0, 0x7170); + #endif - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - break; - - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - break; - - case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - break; - - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - break; - - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - break; - - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - break; - - case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - break; - - case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - break; - - case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - break; - - case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - break; - - case 12: - dst0[3] = src_r0[0]; - break; - - case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - break; - - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - break; - - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - break; - } + #if defined IS_AMD || defined IS_GENERIC + out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); + out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); + out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); + out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); + out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); + out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); + out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); + out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); + #endif } -// before: device_memcat8L -static void memcat_c31_d2x4_sl2x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 src_l0[4], u32 src_l1[4], u32 src_r0[4]) +static void undo_unicode_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { - switch (offset) - { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24; - break; + #ifdef IS_NV + out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); + out[1] = __byte_perm_S (in1[2], in1[3], 0x6420); + out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); + out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); + #endif - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16; - break; - - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8; - break; - - case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; - break; - - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24; - break; - - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16; - break; - - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8; - break; - - case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; - break; - - case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24; - break; - - case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16; - break; - - case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8; - break; - - case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; - break; - - case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24; - break; - - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16; - break; - - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8; - break; - - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; - break; - - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - break; - - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - break; - - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - break; - - case 20: - dst1[1] = src_r0[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; - break; - - case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - break; - - case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - break; - - case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; - break; - - case 24: - dst1[2] = src_r0[0]; - dst1[3] = src_r0[1]; - break; - - case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; - break; - - case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; - break; - - case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; - break; - - case 28: - dst1[3] = src_r0[0]; - break; - - case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; - break; - - case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; - break; - - case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; - break; - } + #if defined IS_AMD || defined IS_GENERIC + out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) + | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); + out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) + | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); + out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) + | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); + out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) + | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); + #endif } -// before: device_memcat12L -static void memcat_c47_d3x4_sl3x4_sr1x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4]) +static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - switch (offset) - { - case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24; - break; + #if defined IS_AMD || defined IS_GENERIC + const int offset_mod_4 = offset & 3; - case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16; - break; + const int offset_minus_4 = 4 - offset; - case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8; - break; - - case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; - break; - - case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24; - break; - - case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16; - break; - - case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8; - break; - - case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; - break; - - case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24; - break; - - case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16; - break; - - case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8; - break; - - case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; - break; - - case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24; - break; - - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16; - break; - - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8; - break; - - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; - break; - - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[0] = src_r0[3] >> 24; - break; - - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[0] = src_r0[3] >> 16; - break; - - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[0] = src_r0[3] >> 8; - break; - - case 20: - dst1[1] = src_r0[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; - dst2[0] = src_r0[3]; - break; - - case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[1] = src_r0[3] >> 24; - break; - - case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[1] = src_r0[3] >> 16; - break; - - case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[1] = src_r0[3] >> 8; - break; - - case 24: - dst1[2] = src_r0[0]; - dst1[3] = src_r0[1]; - dst2[0] = src_r0[2]; - dst2[1] = src_r0[3]; - break; - - case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[2] = src_r0[3] >> 24; - break; - - case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[2] = src_r0[3] >> 16; - break; - - case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[2] = src_r0[3] >> 8; - break; - - case 28: - dst1[3] = src_r0[0]; - dst2[0] = src_r0[1]; - dst2[1] = src_r0[2]; - dst2[2] = src_r0[3]; - break; - - case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; - dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[3] = src_r0[3] >> 24; - break; - - case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; - dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[3] = src_r0[3] >> 16; - break; - - case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; - dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[3] = src_r0[3] >> 8; - break; - - case 32: - dst2[0] = src_r0[0]; - dst2[1] = src_r0[1]; - dst2[2] = src_r0[2]; - dst2[3] = src_r0[3]; - break; - - case 33: - dst2[0] = src_l2[0] | src_r0[0] << 8; - dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; - break; - - case 34: - dst2[0] = src_l2[0] | src_r0[0] << 16; - dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; - break; - - case 35: - dst2[0] = src_l2[0] | src_r0[0] << 24; - dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; - break; - - case 36: - dst2[1] = src_r0[0]; - dst2[2] = src_r0[1]; - dst2[3] = src_r0[2]; - break; - - case 37: - dst2[1] = src_l2[1] | src_r0[0] << 8; - dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; - break; - - case 38: - dst2[1] = src_l2[1] | src_r0[0] << 16; - dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; - break; - - case 39: - dst2[1] = src_l2[1] | src_r0[0] << 24; - dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; - break; - - case 40: - dst2[2] = src_r0[0]; - dst2[3] = src_r0[1]; - break; - - case 41: - dst2[2] = src_l2[2] | src_r0[0] << 8; - dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; - break; - - case 42: - dst2[2] = src_l2[2] | src_r0[0] << 16; - dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; - break; - - case 43: - dst2[2] = src_l2[2] | src_r0[0] << 24; - dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; - break; - - case 44: - dst2[3] = src_r0[0]; - break; - - case 45: - dst2[3] = src_l2[3] | src_r0[0] << 8; - break; - - case 46: - dst2[3] = src_l2[3] | src_r0[0] << 16; - break; - - case 47: - dst2[3] = src_l2[3] | src_r0[0] << 24; - break; - } -} - -// before: device_memcat12L -static void memcat_c47_d3x4_sl3x4_sr2x4 (const u32 offset, u32 dst0[4], u32 dst1[4], u32 dst2[4], u32 src_l0[4], u32 src_l1[4], u32 src_l2[4], u32 src_r0[4], u32 src_r1[4]) -{ - switch (offset) + switch (offset / 4) { case 0: - dst0[0] = src_r0[0]; - dst0[1] = src_r0[1]; - dst0[2] = src_r0[2]; - dst0[3] = src_r0[3]; - dst1[0] = src_r1[0]; - dst1[1] = src_r1[1]; - dst1[2] = src_r1[2]; - dst1[3] = src_r1[3]; + w3[2] = amd_bytealign_S ( 0, w3[1], offset_minus_4); + w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 1: - dst0[0] = src_l0[0] | src_r0[0] << 8; - dst0[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst0[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[0] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[1] = src_r1[0] >> 24 | src_r1[1] << 8; - dst1[2] = src_r1[1] >> 24 | src_r1[2] << 8; - dst1[3] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[0] = src_r1[3] >> 24; + w3[2] = amd_bytealign_S ( 0, w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 2: - dst0[0] = src_l0[0] | src_r0[0] << 16; - dst0[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst0[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[0] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[1] = src_r1[0] >> 16 | src_r1[1] << 16; - dst1[2] = src_r1[1] >> 16 | src_r1[2] << 16; - dst1[3] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[0] = src_r1[3] >> 16; + w3[2] = amd_bytealign_S ( 0, w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 3: - dst0[0] = src_l0[0] | src_r0[0] << 24; - dst0[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst0[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[0] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[1] = src_r1[0] >> 8 | src_r1[1] << 24; - dst1[2] = src_r1[1] >> 8 | src_r1[2] << 24; - dst1[3] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[0] = src_r1[3] >> 8; + w3[2] = amd_bytealign_S ( 0, w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 4: - dst0[1] = src_r0[0]; - dst0[2] = src_r0[1]; - dst0[3] = src_r0[2]; - dst1[0] = src_r0[3]; - dst1[1] = src_r1[0]; - dst1[2] = src_r1[1]; - dst1[3] = src_r1[2]; - dst2[0] = src_r1[3]; + w3[2] = amd_bytealign_S ( 0, w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 5: - dst0[1] = src_l0[1] | src_r0[0] << 8; - dst0[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst0[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[1] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[2] = src_r1[0] >> 24 | src_r1[1] << 8; - dst1[3] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[0] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[1] = src_r1[3] >> 24; + w3[2] = amd_bytealign_S ( 0, w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 6: - dst0[1] = src_l0[1] | src_r0[0] << 16; - dst0[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst0[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[1] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[2] = src_r1[0] >> 16 | src_r1[1] << 16; - dst1[3] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[0] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[1] = src_r1[3] >> 16; + w3[2] = amd_bytealign_S ( 0, w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 7: - dst0[1] = src_l0[1] | src_r0[0] << 24; - dst0[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst0[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[1] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[2] = src_r1[0] >> 8 | src_r1[1] << 24; - dst1[3] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[0] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[1] = src_r1[3] >> 8; + w3[2] = amd_bytealign_S ( 0, w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 8: - dst0[2] = src_r0[0]; - dst0[3] = src_r0[1]; - dst1[0] = src_r0[2]; - dst1[1] = src_r0[3]; - dst1[2] = src_r1[0]; - dst1[3] = src_r1[1]; - dst2[0] = src_r1[2]; - dst2[1] = src_r1[3]; + w3[2] = amd_bytealign_S ( 0, w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 9: - dst0[2] = src_l0[2] | src_r0[0] << 8; - dst0[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[2] = src_r0[3] >> 24 | src_r1[0] << 8; - dst1[3] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[0] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[1] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[2] = src_r1[3] >> 24; + w3[2] = amd_bytealign_S ( 0, w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 10: - dst0[2] = src_l0[2] | src_r0[0] << 16; - dst0[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[2] = src_r0[3] >> 16 | src_r1[0] << 16; - dst1[3] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[0] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[1] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[2] = src_r1[3] >> 16; + w3[2] = amd_bytealign_S ( 0, w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 11: - dst0[2] = src_l0[2] | src_r0[0] << 24; - dst0[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[2] = src_r0[3] >> 8 | src_r1[0] << 24; - dst1[3] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[0] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[1] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[2] = src_r1[3] >> 8; + w3[2] = amd_bytealign_S ( 0, w0[2], offset_minus_4); + w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 12: - dst0[3] = src_r0[0]; - dst1[0] = src_r0[1]; - dst1[1] = src_r0[2]; - dst1[2] = src_r0[3]; - dst1[3] = src_r1[0]; - dst2[0] = src_r1[1]; - dst2[1] = src_r1[2]; - dst2[2] = src_r1[3]; + w3[2] = amd_bytealign_S ( 0, w0[1], offset_minus_4); + w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = 0; + } + break; case 13: - dst0[3] = src_l0[3] | src_r0[0] << 8; - dst1[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst1[3] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[0] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[1] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[2] = src_r1[2] >> 24 | src_r1[3] << 8; - dst2[3] = src_r1[3] >> 24; - break; + w3[2] = amd_bytealign_S ( 0, w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 14: - dst0[3] = src_l0[3] | src_r0[0] << 16; - dst1[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst1[3] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[0] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[1] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[2] = src_r1[2] >> 16 | src_r1[3] << 16; - dst2[3] = src_r1[3] >> 16; - break; + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = 0; + } - case 15: - dst0[3] = src_l0[3] | src_r0[0] << 24; - dst1[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst1[3] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[0] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[1] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[2] = src_r1[2] >> 8 | src_r1[3] << 24; - dst2[3] = src_r1[3] >> 8; - break; - - case 16: - dst1[0] = src_r0[0]; - dst1[1] = src_r0[1]; - dst1[2] = src_r0[2]; - dst1[3] = src_r0[3]; - dst2[0] = src_r1[0]; - dst2[1] = src_r1[1]; - dst2[2] = src_r1[2]; - dst2[3] = src_r1[3]; - break; - - case 17: - dst1[0] = src_l1[0] | src_r0[0] << 8; - dst1[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst1[3] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[0] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[1] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[2] = src_r1[1] >> 24 | src_r1[2] << 8; - dst2[3] = src_r1[2] >> 24 | src_r1[3] << 8; - break; - - case 18: - dst1[0] = src_l1[0] | src_r0[0] << 16; - dst1[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst1[3] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[0] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[1] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[2] = src_r1[1] >> 16 | src_r1[2] << 16; - dst2[3] = src_r1[2] >> 16 | src_r1[3] << 16; - break; - - case 19: - dst1[0] = src_l1[0] | src_r0[0] << 24; - dst1[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst1[3] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[0] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[1] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[2] = src_r1[1] >> 8 | src_r1[2] << 24; - dst2[3] = src_r1[2] >> 8 | src_r1[3] << 24; - break; - - case 20: - dst1[1] = src_r1[0]; - dst1[2] = src_r0[1]; - dst1[3] = src_r0[2]; - dst2[0] = src_r0[3]; - dst2[1] = src_r1[0]; - dst2[2] = src_r1[1]; - dst2[3] = src_r1[2]; - break; - - case 21: - dst1[1] = src_l1[1] | src_r0[0] << 8; - dst1[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst1[3] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[0] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[1] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[2] = src_r1[0] >> 24 | src_r1[1] << 8; - dst2[3] = src_r1[1] >> 24 | src_r1[2] << 8; - break; - - case 22: - dst1[1] = src_l1[1] | src_r0[0] << 16; - dst1[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst1[3] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[0] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[1] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[2] = src_r1[0] >> 16 | src_r1[1] << 16; - dst2[3] = src_r1[1] >> 16 | src_r1[2] << 16; - break; - - case 23: - dst1[1] = src_l1[1] | src_r0[0] << 24; - dst1[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst1[3] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[0] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[1] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[2] = src_r1[0] >> 8 | src_r1[1] << 24; - dst2[3] = src_r1[1] >> 8 | src_r1[2] << 24; - break; - - case 24: - dst1[2] = src_r1[0]; - dst1[3] = src_r0[1]; - dst2[0] = src_r0[2]; - dst2[1] = src_r0[3]; - dst2[2] = src_r1[0]; - dst2[3] = src_r1[1]; - break; - - case 25: - dst1[2] = src_l1[2] | src_r0[0] << 8; - dst1[3] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[0] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[1] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[2] = src_r0[3] >> 24 | src_r1[0] << 8; - dst2[3] = src_r1[0] >> 24 | src_r1[1] << 8; - break; - - case 26: - dst1[2] = src_l1[2] | src_r0[0] << 16; - dst1[3] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[0] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[1] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[2] = src_r0[3] >> 16 | src_r1[0] << 16; - dst2[3] = src_r1[0] >> 16 | src_r1[1] << 16; - break; - - case 27: - dst1[2] = src_l1[2] | src_r0[0] << 24; - dst1[3] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[0] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[1] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[2] = src_r0[3] >> 8 | src_r1[0] << 24; - dst2[3] = src_r1[0] >> 8 | src_r1[1] << 24; - break; - - case 28: - dst1[3] = src_r1[0]; - dst2[0] = src_r0[1]; - dst2[1] = src_r0[2]; - dst2[2] = src_r0[3]; - dst2[3] = src_r1[0]; - break; - - case 29: - dst1[3] = src_l1[3] | src_r0[0] << 8; - dst2[0] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[1] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[2] = src_r0[2] >> 24 | src_r0[3] << 8; - dst2[3] = src_r0[3] >> 24 | src_r1[0] << 8; - break; - - case 30: - dst1[3] = src_l1[3] | src_r0[0] << 16; - dst2[0] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[1] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[2] = src_r0[2] >> 16 | src_r0[3] << 16; - dst2[3] = src_r0[3] >> 16 | src_r1[0] << 16; - break; - - case 31: - dst1[3] = src_l1[3] | src_r0[0] << 24; - dst2[0] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[1] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[2] = src_r0[2] >> 8 | src_r0[3] << 24; - dst2[3] = src_r0[3] >> 8 | src_r1[0] << 24; - break; - - case 32: - dst2[0] = src_r0[0]; - dst2[1] = src_r0[1]; - dst2[2] = src_r0[2]; - dst2[3] = src_r0[3]; - break; - - case 33: - dst2[0] = src_l2[0] | src_r0[0] << 8; - dst2[1] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[2] = src_r0[1] >> 24 | src_r0[2] << 8; - dst2[3] = src_r0[2] >> 24 | src_r0[3] << 8; - break; - - case 34: - dst2[0] = src_l2[0] | src_r0[0] << 16; - dst2[1] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[2] = src_r0[1] >> 16 | src_r0[2] << 16; - dst2[3] = src_r0[2] >> 16 | src_r0[3] << 16; - break; - - case 35: - dst2[0] = src_l2[0] | src_r0[0] << 24; - dst2[1] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[2] = src_r0[1] >> 8 | src_r0[2] << 24; - dst2[3] = src_r0[2] >> 8 | src_r0[3] << 24; - break; - - case 36: - dst2[1] = src_r0[0]; - dst2[2] = src_r0[1]; - dst2[3] = src_r0[2]; - break; - - case 37: - dst2[1] = src_l2[1] | src_r0[0] << 8; - dst2[2] = src_r0[0] >> 24 | src_r0[1] << 8; - dst2[3] = src_r0[1] >> 24 | src_r0[2] << 8; - break; - - case 38: - dst2[1] = src_l2[1] | src_r0[0] << 16; - dst2[2] = src_r0[0] >> 16 | src_r0[1] << 16; - dst2[3] = src_r0[1] >> 16 | src_r0[2] << 16; - break; - - case 39: - dst2[1] = src_l2[1] | src_r0[0] << 24; - dst2[2] = src_r0[0] >> 8 | src_r0[1] << 24; - dst2[3] = src_r0[1] >> 8 | src_r0[2] << 24; - break; - - case 40: - dst2[2] = src_r0[0]; - dst2[3] = src_r0[1]; - break; - - case 41: - dst2[2] = src_l2[2] | src_r0[0] << 8; - dst2[3] = src_r0[0] >> 24 | src_r0[1] << 8; - break; - - case 42: - dst2[2] = src_l2[2] | src_r0[0] << 16; - dst2[3] = src_r0[0] >> 16 | src_r0[1] << 16; - break; - - case 43: - dst2[2] = src_l2[2] | src_r0[0] << 24; - dst2[3] = src_r0[0] >> 8 | src_r0[1] << 24; - break; - - case 44: - dst2[3] = src_r0[0]; - break; - - case 45: - dst2[3] = src_l2[3] | src_r0[0] << 8; - break; - - case 46: - dst2[3] = src_l2[3] | src_r0[0] << 16; - break; - - case 47: - dst2[3] = src_l2[3] | src_r0[0] << 24; break; } -} + #endif -// before: memcat16_9 -static void memcat_c15_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) -{ - switch (offset) + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) { case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; - w2[0] = append2[0]; + w3[1] = __byte_perm_S (w3[0], w3[1], selector); + w3[0] = __byte_perm_S (w2[3], w3[0], selector); + w2[3] = __byte_perm_S (w2[2], w2[3], selector); + w2[2] = __byte_perm_S (w2[1], w2[2], selector); + w2[1] = __byte_perm_S (w2[0], w2[1], selector); + w2[0] = __byte_perm_S (w1[3], w2[0], selector); + w1[3] = __byte_perm_S (w1[2], w1[3], selector); + w1[2] = __byte_perm_S (w1[1], w1[2], selector); + w1[1] = __byte_perm_S (w1[0], w1[1], selector); + w1[0] = __byte_perm_S (w0[3], w1[0], selector); + w0[3] = __byte_perm_S (w0[2], w0[3], selector); + w0[2] = __byte_perm_S (w0[1], w0[2], selector); + w0[1] = __byte_perm_S (w0[0], w0[1], selector); + w0[0] = __byte_perm_S ( 0, w0[0], selector); + break; case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24 | append2[0] << 8; - w2[1] = append2[0] >> 24; + w3[1] = __byte_perm_S (w2[3], w3[0], selector); + w3[0] = __byte_perm_S (w2[2], w2[3], selector); + w2[3] = __byte_perm_S (w2[1], w2[2], selector); + w2[2] = __byte_perm_S (w2[0], w2[1], selector); + w2[1] = __byte_perm_S (w1[3], w2[0], selector); + w2[0] = __byte_perm_S (w1[2], w1[3], selector); + w1[3] = __byte_perm_S (w1[1], w1[2], selector); + w1[2] = __byte_perm_S (w1[0], w1[1], selector); + w1[1] = __byte_perm_S (w0[3], w1[0], selector); + w1[0] = __byte_perm_S (w0[2], w0[3], selector); + w0[3] = __byte_perm_S (w0[1], w0[2], selector); + w0[2] = __byte_perm_S (w0[0], w0[1], selector); + w0[1] = __byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; + break; case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16 | append2[0] << 16; - w2[1] = append2[0] >> 16; + w3[1] = __byte_perm_S (w2[2], w2[3], selector); + w3[0] = __byte_perm_S (w2[1], w2[2], selector); + w2[3] = __byte_perm_S (w2[0], w2[1], selector); + w2[2] = __byte_perm_S (w1[3], w2[0], selector); + w2[1] = __byte_perm_S (w1[2], w1[3], selector); + w2[0] = __byte_perm_S (w1[1], w1[2], selector); + w1[3] = __byte_perm_S (w1[0], w1[1], selector); + w1[2] = __byte_perm_S (w0[3], w1[0], selector); + w1[1] = __byte_perm_S (w0[2], w0[3], selector); + w1[0] = __byte_perm_S (w0[1], w0[2], selector); + w0[3] = __byte_perm_S (w0[0], w0[1], selector); + w0[2] = __byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + break; case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8 | append2[0] << 24; - w2[1] = append2[0] >> 8; + w3[1] = __byte_perm_S (w2[1], w2[2], selector); + w3[0] = __byte_perm_S (w2[0], w2[1], selector); + w2[3] = __byte_perm_S (w1[3], w2[0], selector); + w2[2] = __byte_perm_S (w1[2], w1[3], selector); + w2[1] = __byte_perm_S (w1[1], w1[2], selector); + w2[0] = __byte_perm_S (w1[0], w1[1], selector); + w1[3] = __byte_perm_S (w0[3], w1[0], selector); + w1[2] = __byte_perm_S (w0[2], w0[3], selector); + w1[1] = __byte_perm_S (w0[1], w0[2], selector); + w1[0] = __byte_perm_S (w0[0], w0[1], selector); + w0[3] = __byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; - w2[1] = append2[0]; + w3[1] = __byte_perm_S (w2[0], w2[1], selector); + w3[0] = __byte_perm_S (w1[3], w2[0], selector); + w2[3] = __byte_perm_S (w1[2], w1[3], selector); + w2[2] = __byte_perm_S (w1[1], w1[2], selector); + w2[1] = __byte_perm_S (w1[0], w1[1], selector); + w2[0] = __byte_perm_S (w0[3], w1[0], selector); + w1[3] = __byte_perm_S (w0[2], w0[3], selector); + w1[2] = __byte_perm_S (w0[1], w0[2], selector); + w1[1] = __byte_perm_S (w0[0], w0[1], selector); + w1[0] = __byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24 | append2[0] << 8; - w2[2] = append2[0] >> 24; + w3[1] = __byte_perm_S (w1[3], w2[0], selector); + w3[0] = __byte_perm_S (w1[2], w1[3], selector); + w2[3] = __byte_perm_S (w1[1], w1[2], selector); + w2[2] = __byte_perm_S (w1[0], w1[1], selector); + w2[1] = __byte_perm_S (w0[3], w1[0], selector); + w2[0] = __byte_perm_S (w0[2], w0[3], selector); + w1[3] = __byte_perm_S (w0[1], w0[2], selector); + w1[2] = __byte_perm_S (w0[0], w0[1], selector); + w1[1] = __byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16 | append2[0] << 16; - w2[2] = append2[0] >> 16; + w3[1] = __byte_perm_S (w1[2], w1[3], selector); + w3[0] = __byte_perm_S (w1[1], w1[2], selector); + w2[3] = __byte_perm_S (w1[0], w1[1], selector); + w2[2] = __byte_perm_S (w0[3], w1[0], selector); + w2[1] = __byte_perm_S (w0[2], w0[3], selector); + w2[0] = __byte_perm_S (w0[1], w0[2], selector); + w1[3] = __byte_perm_S (w0[0], w0[1], selector); + w1[2] = __byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8 | append2[0] << 24; - w2[2] = append2[0] >> 8; + w3[1] = __byte_perm_S (w1[1], w1[2], selector); + w3[0] = __byte_perm_S (w1[0], w1[1], selector); + w2[3] = __byte_perm_S (w0[3], w1[0], selector); + w2[2] = __byte_perm_S (w0[2], w0[3], selector); + w2[1] = __byte_perm_S (w0[1], w0[2], selector); + w2[0] = __byte_perm_S (w0[0], w0[1], selector); + w1[3] = __byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; - w2[2] = append2[0]; + w3[1] = __byte_perm_S (w1[0], w1[1], selector); + w3[0] = __byte_perm_S (w0[3], w1[0], selector); + w2[3] = __byte_perm_S (w0[2], w0[3], selector); + w2[2] = __byte_perm_S (w0[1], w0[2], selector); + w2[1] = __byte_perm_S (w0[0], w0[1], selector); + w2[0] = __byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24 | append2[0] << 8; - w2[3] = append2[0] >> 24; + w3[1] = __byte_perm_S (w0[3], w1[0], selector); + w3[0] = __byte_perm_S (w0[2], w0[3], selector); + w2[3] = __byte_perm_S (w0[1], w0[2], selector); + w2[2] = __byte_perm_S (w0[0], w0[1], selector); + w2[1] = __byte_perm_S ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16 | append2[0] << 16; - w2[3] = append2[0] >> 16; + w3[1] = __byte_perm_S (w0[2], w0[3], selector); + w3[0] = __byte_perm_S (w0[1], w0[2], selector); + w2[3] = __byte_perm_S (w0[0], w0[1], selector); + w2[2] = __byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8 | append2[0] << 24; - w2[3] = append2[0] >> 8; + w3[1] = __byte_perm_S (w0[1], w0[2], selector); + w3[0] = __byte_perm_S (w0[0], w0[1], selector); + w2[3] = __byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; - w2[3] = append2[0]; + w3[1] = __byte_perm_S (w0[0], w0[1], selector); + w3[0] = __byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24 | append2[0] << 8; - w3[0] = append2[0] >> 24; - break; + w3[1] = __byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; - case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16 | append2[0] << 16; - w3[0] = append2[0] >> 16; - break; - - case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8 | append2[0] << 24; - w3[0] = append2[0] >> 8; break; } + #endif } -// before: memcat32_8 -static void memcat_c32_w4x4_a2x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 offset) +static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - switch (offset) + #if defined IS_AMD || defined IS_GENERIC + switch (offset / 4) { case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; + w3[2] = amd_bytealign_S (w3[1], 0, offset); + w3[1] = amd_bytealign_S (w3[0], w3[1], offset); + w3[0] = amd_bytealign_S (w2[3], w3[0], offset); + w2[3] = amd_bytealign_S (w2[2], w2[3], offset); + w2[2] = amd_bytealign_S (w2[1], w2[2], offset); + w2[1] = amd_bytealign_S (w2[0], w2[1], offset); + w2[0] = amd_bytealign_S (w1[3], w2[0], offset); + w1[3] = amd_bytealign_S (w1[2], w1[3], offset); + w1[2] = amd_bytealign_S (w1[1], w1[2], offset); + w1[1] = amd_bytealign_S (w1[0], w1[1], offset); + w1[0] = amd_bytealign_S (w0[3], w1[0], offset); + w0[3] = amd_bytealign_S (w0[2], w0[3], offset); + w0[2] = amd_bytealign_S (w0[1], w0[2], offset); + w0[1] = amd_bytealign_S (w0[0], w0[1], offset); + w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24; + w3[2] = amd_bytealign_S (w3[0], 0, offset); + w3[1] = amd_bytealign_S (w2[3], w3[0], offset); + w3[0] = amd_bytealign_S (w2[2], w2[3], offset); + w2[3] = amd_bytealign_S (w2[1], w2[2], offset); + w2[2] = amd_bytealign_S (w2[0], w2[1], offset); + w2[1] = amd_bytealign_S (w1[3], w2[0], offset); + w2[0] = amd_bytealign_S (w1[2], w1[3], offset); + w1[3] = amd_bytealign_S (w1[1], w1[2], offset); + w1[2] = amd_bytealign_S (w1[0], w1[1], offset); + w1[1] = amd_bytealign_S (w0[3], w1[0], offset); + w1[0] = amd_bytealign_S (w0[2], w0[3], offset); + w0[3] = amd_bytealign_S (w0[1], w0[2], offset); + w0[2] = amd_bytealign_S (w0[0], w0[1], offset); + w0[1] = amd_bytealign_S ( 0, w0[0], offset); + w0[0] = 0; break; case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16; + w3[2] = amd_bytealign_S (w2[3], 0, offset); + w3[1] = amd_bytealign_S (w2[2], w2[3], offset); + w3[0] = amd_bytealign_S (w2[1], w2[2], offset); + w2[3] = amd_bytealign_S (w2[0], w2[1], offset); + w2[2] = amd_bytealign_S (w1[3], w2[0], offset); + w2[1] = amd_bytealign_S (w1[2], w1[3], offset); + w2[0] = amd_bytealign_S (w1[1], w1[2], offset); + w1[3] = amd_bytealign_S (w1[0], w1[1], offset); + w1[2] = amd_bytealign_S (w0[3], w1[0], offset); + w1[1] = amd_bytealign_S (w0[2], w0[3], offset); + w1[0] = amd_bytealign_S (w0[1], w0[2], offset); + w0[3] = amd_bytealign_S (w0[0], w0[1], offset); + w0[2] = amd_bytealign_S ( 0, w0[0], offset); + w0[1] = 0; + w0[0] = 0; break; case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8; + w3[2] = amd_bytealign_S (w2[2], 0, offset); + w3[1] = amd_bytealign_S (w2[1], w2[2], offset); + w3[0] = amd_bytealign_S (w2[0], w2[1], offset); + w2[3] = amd_bytealign_S (w1[3], w2[0], offset); + w2[2] = amd_bytealign_S (w1[2], w1[3], offset); + w2[1] = amd_bytealign_S (w1[1], w1[2], offset); + w2[0] = amd_bytealign_S (w1[0], w1[1], offset); + w1[3] = amd_bytealign_S (w0[3], w1[0], offset); + w1[2] = amd_bytealign_S (w0[2], w0[3], offset); + w1[1] = amd_bytealign_S (w0[1], w0[2], offset); + w1[0] = amd_bytealign_S (w0[0], w0[1], offset); + w0[3] = amd_bytealign_S ( 0, w0[0], offset); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; + w3[2] = amd_bytealign_S (w2[1], 0, offset); + w3[1] = amd_bytealign_S (w2[0], w2[1], offset); + w3[0] = amd_bytealign_S (w1[3], w2[0], offset); + w2[3] = amd_bytealign_S (w1[2], w1[3], offset); + w2[2] = amd_bytealign_S (w1[1], w1[2], offset); + w2[1] = amd_bytealign_S (w1[0], w1[1], offset); + w2[0] = amd_bytealign_S (w0[3], w1[0], offset); + w1[3] = amd_bytealign_S (w0[2], w0[3], offset); + w1[2] = amd_bytealign_S (w0[1], w0[2], offset); + w1[1] = amd_bytealign_S (w0[0], w0[1], offset); + w1[0] = amd_bytealign_S ( 0, w0[0], offset); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24; + w3[2] = amd_bytealign_S (w2[0], 0, offset); + w3[1] = amd_bytealign_S (w1[3], w2[0], offset); + w3[0] = amd_bytealign_S (w1[2], w1[3], offset); + w2[3] = amd_bytealign_S (w1[1], w1[2], offset); + w2[2] = amd_bytealign_S (w1[0], w1[1], offset); + w2[1] = amd_bytealign_S (w0[3], w1[0], offset); + w2[0] = amd_bytealign_S (w0[2], w0[3], offset); + w1[3] = amd_bytealign_S (w0[1], w0[2], offset); + w1[2] = amd_bytealign_S (w0[0], w0[1], offset); + w1[1] = amd_bytealign_S ( 0, w0[0], offset); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16; + w3[2] = amd_bytealign_S (w1[3], 0, offset); + w3[1] = amd_bytealign_S (w1[2], w1[3], offset); + w3[0] = amd_bytealign_S (w1[1], w1[2], offset); + w2[3] = amd_bytealign_S (w1[0], w1[1], offset); + w2[2] = amd_bytealign_S (w0[3], w1[0], offset); + w2[1] = amd_bytealign_S (w0[2], w0[3], offset); + w2[0] = amd_bytealign_S (w0[1], w0[2], offset); + w1[3] = amd_bytealign_S (w0[0], w0[1], offset); + w1[2] = amd_bytealign_S ( 0, w0[0], offset); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8; + w3[2] = amd_bytealign_S (w1[2], 0, offset); + w3[1] = amd_bytealign_S (w1[1], w1[2], offset); + w3[0] = amd_bytealign_S (w1[0], w1[1], offset); + w2[3] = amd_bytealign_S (w0[3], w1[0], offset); + w2[2] = amd_bytealign_S (w0[2], w0[3], offset); + w2[1] = amd_bytealign_S (w0[1], w0[2], offset); + w2[0] = amd_bytealign_S (w0[0], w0[1], offset); + w1[3] = amd_bytealign_S ( 0, w0[0], offset); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; + w3[2] = amd_bytealign_S (w1[1], 0, offset); + w3[1] = amd_bytealign_S (w1[0], w1[1], offset); + w3[0] = amd_bytealign_S (w0[3], w1[0], offset); + w2[3] = amd_bytealign_S (w0[2], w0[3], offset); + w2[2] = amd_bytealign_S (w0[1], w0[2], offset); + w2[1] = amd_bytealign_S (w0[0], w0[1], offset); + w2[0] = amd_bytealign_S ( 0, w0[0], offset); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24; + w3[2] = amd_bytealign_S (w1[0], 0, offset); + w3[1] = amd_bytealign_S (w0[3], w1[0], offset); + w3[0] = amd_bytealign_S (w0[2], w0[3], offset); + w2[3] = amd_bytealign_S (w0[1], w0[2], offset); + w2[2] = amd_bytealign_S (w0[0], w0[1], offset); + w2[1] = amd_bytealign_S ( 0, w0[0], offset); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16; + w3[2] = amd_bytealign_S (w0[3], 0, offset); + w3[1] = amd_bytealign_S (w0[2], w0[3], offset); + w3[0] = amd_bytealign_S (w0[1], w0[2], offset); + w2[3] = amd_bytealign_S (w0[0], w0[1], offset); + w2[2] = amd_bytealign_S ( 0, w0[0], offset); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8; + w3[2] = amd_bytealign_S (w0[2], 0, offset); + w3[1] = amd_bytealign_S (w0[1], w0[2], offset); + w3[0] = amd_bytealign_S (w0[0], w0[1], offset); + w2[3] = amd_bytealign_S ( 0, w0[0], offset); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; + w3[2] = amd_bytealign_S (w0[1], 0, offset); + w3[1] = amd_bytealign_S (w0[0], w0[1], offset); + w3[0] = amd_bytealign_S ( 0, w0[0], offset); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24; - break; - - case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16; - break; - - case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8; - break; - - case 16: - w1[0] = append0[0]; - w1[1] = append0[1]; - w1[2] = append0[2]; - w1[3] = append0[3]; - w2[0] = append1[0]; - w2[1] = append1[1]; - w2[2] = append1[2]; - w2[3] = append1[3]; - break; - - case 17: - w1[0] = w1[0] | append0[0] << 8; - w1[1] = append0[0] >> 24 | append0[1] << 8; - w1[2] = append0[1] >> 24 | append0[2] << 8; - w1[3] = append0[2] >> 24 | append0[3] << 8; - w2[0] = append0[3] >> 24 | append1[0] << 8; - w2[1] = append1[0] >> 24 | append1[1] << 8; - w2[2] = append1[1] >> 24 | append1[2] << 8; - w2[3] = append1[2] >> 24 | append1[3] << 8; - w3[0] = append1[3] >> 24; - break; - - case 18: - w1[0] = w1[0] | append0[0] << 16; - w1[1] = append0[0] >> 16 | append0[1] << 16; - w1[2] = append0[1] >> 16 | append0[2] << 16; - w1[3] = append0[2] >> 16 | append0[3] << 16; - w2[0] = append0[3] >> 16 | append1[0] << 16; - w2[1] = append1[0] >> 16 | append1[1] << 16; - w2[2] = append1[1] >> 16 | append1[2] << 16; - w2[3] = append1[2] >> 16 | append1[3] << 16; - w3[0] = append1[3] >> 16; - break; - - case 19: - w1[0] = w1[0] | append0[0] << 24; - w1[1] = append0[0] >> 8 | append0[1] << 24; - w1[2] = append0[1] >> 8 | append0[2] << 24; - w1[3] = append0[2] >> 8 | append0[3] << 24; - w2[0] = append0[3] >> 8 | append1[0] << 24; - w2[1] = append1[0] >> 8 | append1[1] << 24; - w2[2] = append1[1] >> 8 | append1[2] << 24; - w2[3] = append1[2] >> 8 | append1[3] << 24; - w3[0] = append1[3] >> 8; - break; - - case 20: - w1[1] = append0[0]; - w1[2] = append0[1]; - w1[3] = append0[2]; - w2[0] = append0[3]; - w2[1] = append1[0]; - w2[2] = append1[1]; - w2[3] = append1[2]; - w3[0] = append1[3]; - break; - - case 21: - w1[1] = w1[1] | append0[0] << 8; - w1[2] = append0[0] >> 24 | append0[1] << 8; - w1[3] = append0[1] >> 24 | append0[2] << 8; - w2[0] = append0[2] >> 24 | append0[3] << 8; - w2[1] = append0[3] >> 24 | append1[0] << 8; - w2[2] = append1[0] >> 24 | append1[1] << 8; - w2[3] = append1[1] >> 24 | append1[2] << 8; - w3[0] = append1[2] >> 24 | append1[3] << 8; - w3[1] = append1[3] >> 24; - break; - - case 22: - w1[1] = w1[1] | append0[0] << 16; - w1[2] = append0[0] >> 16 | append0[1] << 16; - w1[3] = append0[1] >> 16 | append0[2] << 16; - w2[0] = append0[2] >> 16 | append0[3] << 16; - w2[1] = append0[3] >> 16 | append1[0] << 16; - w2[2] = append1[0] >> 16 | append1[1] << 16; - w2[3] = append1[1] >> 16 | append1[2] << 16; - w3[0] = append1[2] >> 16 | append1[3] << 16; - w3[1] = append1[3] >> 16; - break; - - case 23: - w1[1] = w1[1] | append0[0] << 24; - w1[2] = append0[0] >> 8 | append0[1] << 24; - w1[3] = append0[1] >> 8 | append0[2] << 24; - w2[0] = append0[2] >> 8 | append0[3] << 24; - w2[1] = append0[3] >> 8 | append1[0] << 24; - w2[2] = append1[0] >> 8 | append1[1] << 24; - w2[3] = append1[1] >> 8 | append1[2] << 24; - w3[0] = append1[2] >> 8 | append1[3] << 24; - w3[1] = append1[3] >> 8; - break; - - case 24: - w1[2] = append0[0]; - w1[3] = append0[1]; - w2[0] = append0[2]; - w2[1] = append0[3]; - w2[2] = append1[0]; - w2[3] = append1[1]; - w3[0] = append1[2]; - w3[1] = append1[3]; - break; - - case 25: - w1[2] = w1[2] | append0[0] << 8; - w1[3] = append0[0] >> 24 | append0[1] << 8; - w2[0] = append0[1] >> 24 | append0[2] << 8; - w2[1] = append0[2] >> 24 | append0[3] << 8; - w2[2] = append0[3] >> 24 | append1[0] << 8; - w2[3] = append1[0] >> 24 | append1[1] << 8; - w3[0] = append1[1] >> 24 | append1[2] << 8; - w3[1] = append1[2] >> 24 | append1[3] << 8; - break; - - case 26: - w1[2] = w1[2] | append0[0] << 16; - w1[3] = append0[0] >> 16 | append0[1] << 16; - w2[0] = append0[1] >> 16 | append0[2] << 16; - w2[1] = append0[2] >> 16 | append0[3] << 16; - w2[2] = append0[3] >> 16 | append1[0] << 16; - w2[3] = append1[0] >> 16 | append1[1] << 16; - w3[0] = append1[1] >> 16 | append1[2] << 16; - w3[1] = append1[2] >> 16 | append1[3] << 16; - break; - - case 27: - w1[2] = w1[2] | append0[0] << 24; - w1[3] = append0[0] >> 8 | append0[1] << 24; - w2[0] = append0[1] >> 8 | append0[2] << 24; - w2[1] = append0[2] >> 8 | append0[3] << 24; - w2[2] = append0[3] >> 8 | append1[0] << 24; - w2[3] = append1[0] >> 8 | append1[1] << 24; - w3[0] = append1[1] >> 8 | append1[2] << 24; - w3[1] = append1[2] >> 8 | append1[3] << 24; - break; - - case 28: - w1[3] = append0[0]; - w2[0] = append0[1]; - w2[1] = append0[2]; - w2[2] = append0[3]; - w2[3] = append1[0]; - w3[0] = append1[1]; - w3[1] = append1[2]; - break; - - case 29: - w1[3] = w1[3] | append0[0] << 8; - w2[0] = append0[0] >> 24 | append0[1] << 8; - w2[1] = append0[1] >> 24 | append0[2] << 8; - w2[2] = append0[2] >> 24 | append0[3] << 8; - w2[3] = append0[3] >> 24 | append1[0] << 8; - w3[0] = append1[0] >> 24 | append1[1] << 8; - w3[1] = append1[1] >> 24 | append1[2] << 8; - break; - - case 30: - w1[3] = w1[3] | append0[0] << 16; - w2[0] = append0[0] >> 16 | append0[1] << 16; - w2[1] = append0[1] >> 16 | append0[2] << 16; - w2[2] = append0[2] >> 16 | append0[3] << 16; - w2[3] = append0[3] >> 16 | append1[0] << 16; - w3[0] = append1[0] >> 16 | append1[1] << 16; - w3[1] = append1[1] >> 16 | append1[2] << 16; - break; - - case 31: - w1[3] = w1[3] | append0[0] << 24; - w2[0] = append0[0] >> 8 | append0[1] << 24; - w2[1] = append0[1] >> 8 | append0[2] << 24; - w2[2] = append0[2] >> 8 | append0[3] << 24; - w2[3] = append0[3] >> 8 | append1[0] << 24; - w3[0] = append1[0] >> 8 | append1[1] << 24; - w3[1] = append1[1] >> 8 | append1[2] << 24; - break; - - case 32: - w2[0] = append0[0]; - w2[1] = append0[1]; - w2[2] = append0[2]; - w2[3] = append0[3]; - w3[0] = append1[0]; - w3[1] = append1[1]; + w3[2] = amd_bytealign_S (w0[0], 0, offset); + w3[1] = amd_bytealign_S ( 0, w0[0], offset); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } -} + #endif -// before: memcat32_9 -static void memcat_c32_w4x4_a3x4 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 offset) -{ - switch (offset) + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) { case 0: - w0[0] = append0[0]; - w0[1] = append0[1]; - w0[2] = append0[2]; - w0[3] = append0[3]; - w1[0] = append1[0]; - w1[1] = append1[1]; - w1[2] = append1[2]; - w1[3] = append1[3]; - w2[0] = append2[0]; + w3[1] = __byte_perm_S (w3[1], w3[0], selector); + w3[0] = __byte_perm_S (w3[0], w2[3], selector); + w2[3] = __byte_perm_S (w2[3], w2[2], selector); + w2[2] = __byte_perm_S (w2[2], w2[1], selector); + w2[1] = __byte_perm_S (w2[1], w2[0], selector); + w2[0] = __byte_perm_S (w2[0], w1[3], selector); + w1[3] = __byte_perm_S (w1[3], w1[2], selector); + w1[2] = __byte_perm_S (w1[2], w1[1], selector); + w1[1] = __byte_perm_S (w1[1], w1[0], selector); + w1[0] = __byte_perm_S (w1[0], w0[3], selector); + w0[3] = __byte_perm_S (w0[3], w0[2], selector); + w0[2] = __byte_perm_S (w0[2], w0[1], selector); + w0[1] = __byte_perm_S (w0[1], w0[0], selector); + w0[0] = __byte_perm_S (w0[0], 0, selector); break; case 1: - w0[0] = w0[0] | append0[0] << 8; - w0[1] = append0[0] >> 24 | append0[1] << 8; - w0[2] = append0[1] >> 24 | append0[2] << 8; - w0[3] = append0[2] >> 24 | append0[3] << 8; - w1[0] = append0[3] >> 24 | append1[0] << 8; - w1[1] = append1[0] >> 24 | append1[1] << 8; - w1[2] = append1[1] >> 24 | append1[2] << 8; - w1[3] = append1[2] >> 24 | append1[3] << 8; - w2[0] = append1[3] >> 24 | append2[0] << 8; - w2[1] = append2[0] >> 24; + w3[1] = __byte_perm_S (w3[0], w2[3], selector); + w3[0] = __byte_perm_S (w2[3], w2[2], selector); + w2[3] = __byte_perm_S (w2[2], w2[1], selector); + w2[2] = __byte_perm_S (w2[1], w2[0], selector); + w2[1] = __byte_perm_S (w2[0], w1[3], selector); + w2[0] = __byte_perm_S (w1[3], w1[2], selector); + w1[3] = __byte_perm_S (w1[2], w1[1], selector); + w1[2] = __byte_perm_S (w1[1], w1[0], selector); + w1[1] = __byte_perm_S (w1[0], w0[3], selector); + w1[0] = __byte_perm_S (w0[3], w0[2], selector); + w0[3] = __byte_perm_S (w0[2], w0[1], selector); + w0[2] = __byte_perm_S (w0[1], w0[0], selector); + w0[1] = __byte_perm_S (w0[0], 0, selector); + w0[0] = 0; break; case 2: - w0[0] = w0[0] | append0[0] << 16; - w0[1] = append0[0] >> 16 | append0[1] << 16; - w0[2] = append0[1] >> 16 | append0[2] << 16; - w0[3] = append0[2] >> 16 | append0[3] << 16; - w1[0] = append0[3] >> 16 | append1[0] << 16; - w1[1] = append1[0] >> 16 | append1[1] << 16; - w1[2] = append1[1] >> 16 | append1[2] << 16; - w1[3] = append1[2] >> 16 | append1[3] << 16; - w2[0] = append1[3] >> 16 | append2[0] << 16; - w2[1] = append2[0] >> 16; + w3[1] = __byte_perm_S (w2[3], w2[2], selector); + w3[0] = __byte_perm_S (w2[2], w2[1], selector); + w2[3] = __byte_perm_S (w2[1], w2[0], selector); + w2[2] = __byte_perm_S (w2[0], w1[3], selector); + w2[1] = __byte_perm_S (w1[3], w1[2], selector); + w2[0] = __byte_perm_S (w1[2], w1[1], selector); + w1[3] = __byte_perm_S (w1[1], w1[0], selector); + w1[2] = __byte_perm_S (w1[0], w0[3], selector); + w1[1] = __byte_perm_S (w0[3], w0[2], selector); + w1[0] = __byte_perm_S (w0[2], w0[1], selector); + w0[3] = __byte_perm_S (w0[1], w0[0], selector); + w0[2] = __byte_perm_S (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; break; case 3: - w0[0] = w0[0] | append0[0] << 24; - w0[1] = append0[0] >> 8 | append0[1] << 24; - w0[2] = append0[1] >> 8 | append0[2] << 24; - w0[3] = append0[2] >> 8 | append0[3] << 24; - w1[0] = append0[3] >> 8 | append1[0] << 24; - w1[1] = append1[0] >> 8 | append1[1] << 24; - w1[2] = append1[1] >> 8 | append1[2] << 24; - w1[3] = append1[2] >> 8 | append1[3] << 24; - w2[0] = append1[3] >> 8 | append2[0] << 24; - w2[1] = append2[0] >> 8; + w3[1] = __byte_perm_S (w2[2], w2[1], selector); + w3[0] = __byte_perm_S (w2[1], w2[0], selector); + w2[3] = __byte_perm_S (w2[0], w1[3], selector); + w2[2] = __byte_perm_S (w1[3], w1[2], selector); + w2[1] = __byte_perm_S (w1[2], w1[1], selector); + w2[0] = __byte_perm_S (w1[1], w1[0], selector); + w1[3] = __byte_perm_S (w1[0], w0[3], selector); + w1[2] = __byte_perm_S (w0[3], w0[2], selector); + w1[1] = __byte_perm_S (w0[2], w0[1], selector); + w1[0] = __byte_perm_S (w0[1], w0[0], selector); + w0[3] = __byte_perm_S (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 4: - w0[1] = append0[0]; - w0[2] = append0[1]; - w0[3] = append0[2]; - w1[0] = append0[3]; - w1[1] = append1[0]; - w1[2] = append1[1]; - w1[3] = append1[2]; - w2[0] = append1[3]; - w2[1] = append2[0]; + w3[1] = __byte_perm_S (w2[1], w2[0], selector); + w3[0] = __byte_perm_S (w2[0], w1[3], selector); + w2[3] = __byte_perm_S (w1[3], w1[2], selector); + w2[2] = __byte_perm_S (w1[2], w1[1], selector); + w2[1] = __byte_perm_S (w1[1], w1[0], selector); + w2[0] = __byte_perm_S (w1[0], w0[3], selector); + w1[3] = __byte_perm_S (w0[3], w0[2], selector); + w1[2] = __byte_perm_S (w0[2], w0[1], selector); + w1[1] = __byte_perm_S (w0[1], w0[0], selector); + w1[0] = __byte_perm_S (w0[0], 0, selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 5: - w0[1] = w0[1] | append0[0] << 8; - w0[2] = append0[0] >> 24 | append0[1] << 8; - w0[3] = append0[1] >> 24 | append0[2] << 8; - w1[0] = append0[2] >> 24 | append0[3] << 8; - w1[1] = append0[3] >> 24 | append1[0] << 8; - w1[2] = append1[0] >> 24 | append1[1] << 8; - w1[3] = append1[1] >> 24 | append1[2] << 8; - w2[0] = append1[2] >> 24 | append1[3] << 8; - w2[1] = append1[3] >> 24 | append2[0] << 8; - w2[2] = append2[0] >> 24; + w3[1] = __byte_perm_S (w2[0], w1[3], selector); + w3[0] = __byte_perm_S (w1[3], w1[2], selector); + w2[3] = __byte_perm_S (w1[2], w1[1], selector); + w2[2] = __byte_perm_S (w1[1], w1[0], selector); + w2[1] = __byte_perm_S (w1[0], w0[3], selector); + w2[0] = __byte_perm_S (w0[3], w0[2], selector); + w1[3] = __byte_perm_S (w0[2], w0[1], selector); + w1[2] = __byte_perm_S (w0[1], w0[0], selector); + w1[1] = __byte_perm_S (w0[0], 0, selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 6: - w0[1] = w0[1] | append0[0] << 16; - w0[2] = append0[0] >> 16 | append0[1] << 16; - w0[3] = append0[1] >> 16 | append0[2] << 16; - w1[0] = append0[2] >> 16 | append0[3] << 16; - w1[1] = append0[3] >> 16 | append1[0] << 16; - w1[2] = append1[0] >> 16 | append1[1] << 16; - w1[3] = append1[1] >> 16 | append1[2] << 16; - w2[0] = append1[2] >> 16 | append1[3] << 16; - w2[1] = append1[3] >> 16 | append2[0] << 16; - w2[2] = append2[0] >> 16; + w3[1] = __byte_perm_S (w1[3], w1[2], selector); + w3[0] = __byte_perm_S (w1[2], w1[1], selector); + w2[3] = __byte_perm_S (w1[1], w1[0], selector); + w2[2] = __byte_perm_S (w1[0], w0[3], selector); + w2[1] = __byte_perm_S (w0[3], w0[2], selector); + w2[0] = __byte_perm_S (w0[2], w0[1], selector); + w1[3] = __byte_perm_S (w0[1], w0[0], selector); + w1[2] = __byte_perm_S (w0[0], 0, selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 7: - w0[1] = w0[1] | append0[0] << 24; - w0[2] = append0[0] >> 8 | append0[1] << 24; - w0[3] = append0[1] >> 8 | append0[2] << 24; - w1[0] = append0[2] >> 8 | append0[3] << 24; - w1[1] = append0[3] >> 8 | append1[0] << 24; - w1[2] = append1[0] >> 8 | append1[1] << 24; - w1[3] = append1[1] >> 8 | append1[2] << 24; - w2[0] = append1[2] >> 8 | append1[3] << 24; - w2[1] = append1[3] >> 8 | append2[0] << 24; - w2[2] = append2[0] >> 8; + w3[1] = __byte_perm_S (w1[2], w1[1], selector); + w3[0] = __byte_perm_S (w1[1], w1[0], selector); + w2[3] = __byte_perm_S (w1[0], w0[3], selector); + w2[2] = __byte_perm_S (w0[3], w0[2], selector); + w2[1] = __byte_perm_S (w0[2], w0[1], selector); + w2[0] = __byte_perm_S (w0[1], w0[0], selector); + w1[3] = __byte_perm_S (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 8: - w0[2] = append0[0]; - w0[3] = append0[1]; - w1[0] = append0[2]; - w1[1] = append0[3]; - w1[2] = append1[0]; - w1[3] = append1[1]; - w2[0] = append1[2]; - w2[1] = append1[3]; - w2[2] = append2[0]; + w3[1] = __byte_perm_S (w1[1], w1[0], selector); + w3[0] = __byte_perm_S (w1[0], w0[3], selector); + w2[3] = __byte_perm_S (w0[3], w0[2], selector); + w2[2] = __byte_perm_S (w0[2], w0[1], selector); + w2[1] = __byte_perm_S (w0[1], w0[0], selector); + w2[0] = __byte_perm_S (w0[0], 0, selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 9: - w0[2] = w0[2] | append0[0] << 8; - w0[3] = append0[0] >> 24 | append0[1] << 8; - w1[0] = append0[1] >> 24 | append0[2] << 8; - w1[1] = append0[2] >> 24 | append0[3] << 8; - w1[2] = append0[3] >> 24 | append1[0] << 8; - w1[3] = append1[0] >> 24 | append1[1] << 8; - w2[0] = append1[1] >> 24 | append1[2] << 8; - w2[1] = append1[2] >> 24 | append1[3] << 8; - w2[2] = append1[3] >> 24 | append2[0] << 8; - w2[3] = append2[0] >> 24; + w3[1] = __byte_perm_S (w1[0], w0[3], selector); + w3[0] = __byte_perm_S (w0[3], w0[2], selector); + w2[3] = __byte_perm_S (w0[2], w0[1], selector); + w2[2] = __byte_perm_S (w0[1], w0[0], selector); + w2[1] = __byte_perm_S (w0[0], 0, selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 10: - w0[2] = w0[2] | append0[0] << 16; - w0[3] = append0[0] >> 16 | append0[1] << 16; - w1[0] = append0[1] >> 16 | append0[2] << 16; - w1[1] = append0[2] >> 16 | append0[3] << 16; - w1[2] = append0[3] >> 16 | append1[0] << 16; - w1[3] = append1[0] >> 16 | append1[1] << 16; - w2[0] = append1[1] >> 16 | append1[2] << 16; - w2[1] = append1[2] >> 16 | append1[3] << 16; - w2[2] = append1[3] >> 16 | append2[0] << 16; - w2[3] = append2[0] >> 16; + w3[1] = __byte_perm_S (w0[3], w0[2], selector); + w3[0] = __byte_perm_S (w0[2], w0[1], selector); + w2[3] = __byte_perm_S (w0[1], w0[0], selector); + w2[2] = __byte_perm_S (w0[0], 0, selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 11: - w0[2] = w0[2] | append0[0] << 24; - w0[3] = append0[0] >> 8 | append0[1] << 24; - w1[0] = append0[1] >> 8 | append0[2] << 24; - w1[1] = append0[2] >> 8 | append0[3] << 24; - w1[2] = append0[3] >> 8 | append1[0] << 24; - w1[3] = append1[0] >> 8 | append1[1] << 24; - w2[0] = append1[1] >> 8 | append1[2] << 24; - w2[1] = append1[2] >> 8 | append1[3] << 24; - w2[2] = append1[3] >> 8 | append2[0] << 24; - w2[3] = append2[0] >> 8; + w3[1] = __byte_perm_S (w0[2], w0[1], selector); + w3[0] = __byte_perm_S (w0[1], w0[0], selector); + w2[3] = __byte_perm_S (w0[0], 0, selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 12: - w0[3] = append0[0]; - w1[0] = append0[1]; - w1[1] = append0[2]; - w1[2] = append0[3]; - w1[3] = append1[0]; - w2[0] = append1[1]; - w2[1] = append1[2]; - w2[2] = append1[3]; - w2[3] = append2[0]; + w3[1] = __byte_perm_S (w0[1], w0[0], selector); + w3[0] = __byte_perm_S (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; case 13: - w0[3] = w0[3] | append0[0] << 8; - w1[0] = append0[0] >> 24 | append0[1] << 8; - w1[1] = append0[1] >> 24 | append0[2] << 8; - w1[2] = append0[2] >> 24 | append0[3] << 8; - w1[3] = append0[3] >> 24 | append1[0] << 8; - w2[0] = append1[0] >> 24 | append1[1] << 8; - w2[1] = append1[1] >> 24 | append1[2] << 8; - w2[2] = append1[2] >> 24 | append1[3] << 8; - w2[3] = append1[3] >> 24 | append2[0] << 8; - w3[0] = append2[0] >> 24; - break; - - case 14: - w0[3] = w0[3] | append0[0] << 16; - w1[0] = append0[0] >> 16 | append0[1] << 16; - w1[1] = append0[1] >> 16 | append0[2] << 16; - w1[2] = append0[2] >> 16 | append0[3] << 16; - w1[3] = append0[3] >> 16 | append1[0] << 16; - w2[0] = append1[0] >> 16 | append1[1] << 16; - w2[1] = append1[1] >> 16 | append1[2] << 16; - w2[2] = append1[2] >> 16 | append1[3] << 16; - w2[3] = append1[3] >> 16 | append2[0] << 16; - w3[0] = append2[0] >> 16; - break; - - case 15: - w0[3] = w0[3] | append0[0] << 24; - w1[0] = append0[0] >> 8 | append0[1] << 24; - w1[1] = append0[1] >> 8 | append0[2] << 24; - w1[2] = append0[2] >> 8 | append0[3] << 24; - w1[3] = append0[3] >> 8 | append1[0] << 24; - w2[0] = append1[0] >> 8 | append1[1] << 24; - w2[1] = append1[1] >> 8 | append1[2] << 24; - w2[2] = append1[2] >> 8 | append1[3] << 24; - w2[3] = append1[3] >> 8 | append2[0] << 24; - w3[0] = append2[0] >> 8; - break; - - case 16: - w1[0] = append0[0]; - w1[1] = append0[1]; - w1[2] = append0[2]; - w1[3] = append0[3]; - w2[0] = append1[0]; - w2[1] = append1[1]; - w2[2] = append1[2]; - w2[3] = append1[3]; - w3[0] = append2[0]; - break; - - case 17: - w1[0] = w1[0] | append0[0] << 8; - w1[1] = append0[0] >> 24 | append0[1] << 8; - w1[2] = append0[1] >> 24 | append0[2] << 8; - w1[3] = append0[2] >> 24 | append0[3] << 8; - w2[0] = append0[3] >> 24 | append1[0] << 8; - w2[1] = append1[0] >> 24 | append1[1] << 8; - w2[2] = append1[1] >> 24 | append1[2] << 8; - w2[3] = append1[2] >> 24 | append1[3] << 8; - w3[0] = append1[3] >> 24 | append2[0] << 8; - w3[1] = append2[0] >> 24; - break; - - case 18: - w1[0] = w1[0] | append0[0] << 16; - w1[1] = append0[0] >> 16 | append0[1] << 16; - w1[2] = append0[1] >> 16 | append0[2] << 16; - w1[3] = append0[2] >> 16 | append0[3] << 16; - w2[0] = append0[3] >> 16 | append1[0] << 16; - w2[1] = append1[0] >> 16 | append1[1] << 16; - w2[2] = append1[1] >> 16 | append1[2] << 16; - w2[3] = append1[2] >> 16 | append1[3] << 16; - w3[0] = append1[3] >> 16 | append2[0] << 16; - w3[1] = append2[0] >> 16; - break; - - case 19: - w1[0] = w1[0] | append0[0] << 24; - w1[1] = append0[0] >> 8 | append0[1] << 24; - w1[2] = append0[1] >> 8 | append0[2] << 24; - w1[3] = append0[2] >> 8 | append0[3] << 24; - w2[0] = append0[3] >> 8 | append1[0] << 24; - w2[1] = append1[0] >> 8 | append1[1] << 24; - w2[2] = append1[1] >> 8 | append1[2] << 24; - w2[3] = append1[2] >> 8 | append1[3] << 24; - w3[0] = append1[3] >> 8 | append2[0] << 24; - w3[1] = append2[0] >> 8; - break; - - case 20: - w1[1] = append0[0]; - w1[2] = append0[1]; - w1[3] = append0[2]; - w2[0] = append0[3]; - w2[1] = append1[0]; - w2[2] = append1[1]; - w2[3] = append1[2]; - w3[0] = append1[3]; - w3[1] = append2[0]; - break; - - case 21: - w1[1] = w1[1] | append0[0] << 8; - w1[2] = append0[0] >> 24 | append0[1] << 8; - w1[3] = append0[1] >> 24 | append0[2] << 8; - w2[0] = append0[2] >> 24 | append0[3] << 8; - w2[1] = append0[3] >> 24 | append1[0] << 8; - w2[2] = append1[0] >> 24 | append1[1] << 8; - w2[3] = append1[1] >> 24 | append1[2] << 8; - w3[0] = append1[2] >> 24 | append1[3] << 8; - w3[1] = append1[3] >> 24 | append2[0] << 8; - break; - - case 22: - w1[1] = w1[1] | append0[0] << 16; - w1[2] = append0[0] >> 16 | append0[1] << 16; - w1[3] = append0[1] >> 16 | append0[2] << 16; - w2[0] = append0[2] >> 16 | append0[3] << 16; - w2[1] = append0[3] >> 16 | append1[0] << 16; - w2[2] = append1[0] >> 16 | append1[1] << 16; - w2[3] = append1[1] >> 16 | append1[2] << 16; - w3[0] = append1[2] >> 16 | append1[3] << 16; - w3[1] = append1[3] >> 16 | append2[0] << 16; - break; - - case 23: - w1[1] = w1[1] | append0[0] << 24; - w1[2] = append0[0] >> 8 | append0[1] << 24; - w1[3] = append0[1] >> 8 | append0[2] << 24; - w2[0] = append0[2] >> 8 | append0[3] << 24; - w2[1] = append0[3] >> 8 | append1[0] << 24; - w2[2] = append1[0] >> 8 | append1[1] << 24; - w2[3] = append1[1] >> 8 | append1[2] << 24; - w3[0] = append1[2] >> 8 | append1[3] << 24; - w3[1] = append1[3] >> 8 | append2[0] << 24; - break; - - case 24: - w1[2] = append0[0]; - w1[3] = append0[1]; - w2[0] = append0[2]; - w2[1] = append0[3]; - w2[2] = append1[0]; - w2[3] = append1[1]; - w3[0] = append1[2]; - w3[1] = append1[3]; - break; - - case 25: - w1[2] = w1[2] | append0[0] << 8; - w1[3] = append0[0] >> 24 | append0[1] << 8; - w2[0] = append0[1] >> 24 | append0[2] << 8; - w2[1] = append0[2] >> 24 | append0[3] << 8; - w2[2] = append0[3] >> 24 | append1[0] << 8; - w2[3] = append1[0] >> 24 | append1[1] << 8; - w3[0] = append1[1] >> 24 | append1[2] << 8; - w3[1] = append1[2] >> 24 | append1[3] << 8; - break; - - case 26: - w1[2] = w1[2] | append0[0] << 16; - w1[3] = append0[0] >> 16 | append0[1] << 16; - w2[0] = append0[1] >> 16 | append0[2] << 16; - w2[1] = append0[2] >> 16 | append0[3] << 16; - w2[2] = append0[3] >> 16 | append1[0] << 16; - w2[3] = append1[0] >> 16 | append1[1] << 16; - w3[0] = append1[1] >> 16 | append1[2] << 16; - w3[1] = append1[2] >> 16 | append1[3] << 16; - break; - - case 27: - w1[2] = w1[2] | append0[0] << 24; - w1[3] = append0[0] >> 8 | append0[1] << 24; - w2[0] = append0[1] >> 8 | append0[2] << 24; - w2[1] = append0[2] >> 8 | append0[3] << 24; - w2[2] = append0[3] >> 8 | append1[0] << 24; - w2[3] = append1[0] >> 8 | append1[1] << 24; - w3[0] = append1[1] >> 8 | append1[2] << 24; - w3[1] = append1[2] >> 8 | append1[3] << 24; - break; - - case 28: - w1[3] = append0[0]; - w2[0] = append0[1]; - w2[1] = append0[2]; - w2[2] = append0[3]; - w2[3] = append1[0]; - w3[0] = append1[1]; - w3[1] = append1[2]; - break; - - case 29: - w1[3] = w1[3] | append0[0] << 8; - w2[0] = append0[0] >> 24 | append0[1] << 8; - w2[1] = append0[1] >> 24 | append0[2] << 8; - w2[2] = append0[2] >> 24 | append0[3] << 8; - w2[3] = append0[3] >> 24 | append1[0] << 8; - w3[0] = append1[0] >> 24 | append1[1] << 8; - w3[1] = append1[1] >> 24 | append1[2] << 8; - break; - - case 30: - w1[3] = w1[3] | append0[0] << 16; - w2[0] = append0[0] >> 16 | append0[1] << 16; - w2[1] = append0[1] >> 16 | append0[2] << 16; - w2[2] = append0[2] >> 16 | append0[3] << 16; - w2[3] = append0[3] >> 16 | append1[0] << 16; - w3[0] = append1[0] >> 16 | append1[1] << 16; - w3[1] = append1[1] >> 16 | append1[2] << 16; - break; - - case 31: - w1[3] = w1[3] | append0[0] << 24; - w2[0] = append0[0] >> 8 | append0[1] << 24; - w2[1] = append0[1] >> 8 | append0[2] << 24; - w2[2] = append0[2] >> 8 | append0[3] << 24; - w2[3] = append0[3] >> 8 | append1[0] << 24; - w3[0] = append1[0] >> 8 | append1[1] << 24; - w3[1] = append1[1] >> 8 | append1[2] << 24; - break; - - case 32: - w2[0] = append0[0]; - w2[1] = append0[1]; - w2[2] = append0[2]; - w2[3] = append0[3]; - w3[0] = append1[0]; - w3[1] = append1[1]; + w3[1] = __byte_perm_S (w0[0], 0, selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; break; } + #endif } - -*/ diff --git a/OpenCL/m00000_a1.cl b/OpenCL/m00000_a1.cl index eb398a112..3384966fd 100644 --- a/OpenCL/m00000_a1.cl +++ b/OpenCL/m00000_a1.cl @@ -70,7 +70,7 @@ __kernel void m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -113,7 +113,7 @@ __kernel void m00000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -286,7 +286,7 @@ __kernel void m00000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -341,7 +341,7 @@ __kernel void m00000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00000_a3.cl b/OpenCL/m00000_a3.cl index 8995ee915..cedf59294 100644 --- a/OpenCL/m00000_a3.cl +++ b/OpenCL/m00000_a3.cl @@ -23,7 +23,7 @@ #define MD5_STEP_REV(f,a,b,c,d,x,t,s) \ { \ a -= b; \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= f (b, c, d); \ a -= x; \ a -= t; \ @@ -32,7 +32,7 @@ #define MD5_STEP_REV1(f,a,b,c,d,x,t,s) \ { \ a -= b; \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= x; \ a -= t; \ } @@ -306,32 +306,32 @@ static void m00000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - u32x a_rev = digests_buf[digests_offset].digest_buf[0]; - u32x b_rev = digests_buf[digests_offset].digest_buf[1]; - u32x c_rev = digests_buf[digests_offset].digest_buf[2]; - u32x d_rev = digests_buf[digests_offset].digest_buf[3]; + u32 a_rev = digests_buf[digests_offset].digest_buf[0]; + u32 b_rev = digests_buf[digests_offset].digest_buf[1]; + u32 c_rev = digests_buf[digests_offset].digest_buf[2]; + u32 d_rev = digests_buf[digests_offset].digest_buf[3]; - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30); - const u32x pre_cd = c_rev ^ d_rev; + const u32 pre_cd = c_rev ^ d_rev; - MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23); - MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22); + MD5_STEP_REV1(MD5_H_S, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23); + MD5_STEP_REV1(MD5_H_S, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22); /** * loop diff --git a/OpenCL/m00010_a0.cl b/OpenCL/m00010_a0.cl index 86b18e18c..47c478d1c 100644 --- a/OpenCL/m00010_a0.cl +++ b/OpenCL/m00010_a0.cl @@ -142,7 +142,7 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; @@ -397,7 +397,7 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; diff --git a/OpenCL/m00010_a1.cl b/OpenCL/m00010_a1.cl index d74c4e809..a5b7fc19e 100644 --- a/OpenCL/m00010_a1.cl +++ b/OpenCL/m00010_a1.cl @@ -68,7 +68,7 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -131,7 +131,7 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -166,7 +166,7 @@ __kernel void m00010_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -345,7 +345,7 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -420,7 +420,7 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -455,7 +455,7 @@ __kernel void m00010_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m00010_a3.cl b/OpenCL/m00010_a3.cl index 8a31f3c93..677fc5ac3 100644 --- a/OpenCL/m00010_a3.cl +++ b/OpenCL/m00010_a3.cl @@ -23,7 +23,7 @@ #define MD5_STEP_REV(f,a,b,c,d,x,t,s) \ { \ a -= b; \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= f (b, c, d); \ a -= x; \ a -= t; \ @@ -32,7 +32,7 @@ #define MD5_STEP_REV1(f,a,b,c,d,x,t,s) \ { \ a -= b; \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= x; \ a -= t; \ } @@ -78,7 +78,7 @@ static void m00010m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); w[ 0] |= salt_buf0[0]; w[ 1] |= salt_buf0[1]; @@ -363,32 +363,32 @@ static void m00010s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - u32x a_rev = digests_buf[digests_offset].digest_buf[0]; - u32x b_rev = digests_buf[digests_offset].digest_buf[1]; - u32x c_rev = digests_buf[digests_offset].digest_buf[2]; - u32x d_rev = digests_buf[digests_offset].digest_buf[3]; + u32 a_rev = digests_buf[digests_offset].digest_buf[0]; + u32 b_rev = digests_buf[digests_offset].digest_buf[1]; + u32 c_rev = digests_buf[digests_offset].digest_buf[2]; + u32 d_rev = digests_buf[digests_offset].digest_buf[3]; - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30); - const u32x pre_cd = c_rev ^ d_rev; + const u32 pre_cd = c_rev ^ d_rev; - MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23); - MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22); + MD5_STEP_REV1(MD5_H_S, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23); + MD5_STEP_REV1(MD5_H_S, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22); /** * loop diff --git a/OpenCL/m00020_a0.cl b/OpenCL/m00020_a0.cl index 471526aaf..eba8586da 100644 --- a/OpenCL/m00020_a0.cl +++ b/OpenCL/m00020_a0.cl @@ -132,7 +132,7 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -366,7 +366,7 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m00020_a1.cl b/OpenCL/m00020_a1.cl index f4848ab86..64c50e5d8 100644 --- a/OpenCL/m00020_a1.cl +++ b/OpenCL/m00020_a1.cl @@ -68,7 +68,7 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -125,7 +125,7 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -178,7 +178,7 @@ __kernel void m00020_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -338,7 +338,7 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -407,7 +407,7 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -460,7 +460,7 @@ __kernel void m00020_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m00020_a3.cl b/OpenCL/m00020_a3.cl index ed59626bd..0efc98a98 100644 --- a/OpenCL/m00020_a3.cl +++ b/OpenCL/m00020_a3.cl @@ -65,6 +65,51 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = w0[0]; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + /** * loop */ @@ -73,64 +118,52 @@ static void m00020m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - #if VECT_SIZE == 1 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i); - #elif VECT_SIZE == 2 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i); - #elif VECT_SIZE == 4 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i); - #elif VECT_SIZE == 8 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i); - #endif + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; + + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; + + overwrite_at_le (wx, w0lr, salt_len); u32x w0_t[4]; u32x w1_t[4]; u32x w2_t[4]; u32x w3_t[4]; - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); - + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; w3_t[2] = pw_salt_len * 8; - - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; + w3_t[3] = 0; /** * md5 @@ -272,6 +305,51 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = w0[0]; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + /** * loop */ @@ -280,64 +358,52 @@ static void m00020s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - #if VECT_SIZE == 1 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i); - #elif VECT_SIZE == 2 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i); - #elif VECT_SIZE == 4 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i); - #elif VECT_SIZE == 8 - const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i); - #endif + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; + + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; + + overwrite_at_le (wx, w0lr, salt_len); u32x w0_t[4]; u32x w1_t[4]; u32x w2_t[4]; u32x w3_t[4]; - w0_t[0] = w0lr; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; - - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); - + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; w3_t[2] = pw_salt_len * 8; - - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; + w3_t[3] = 0; /** * md5 diff --git a/OpenCL/m00030_a0.cl b/OpenCL/m00030_a0.cl index ec09fa38e..d2f314968 100644 --- a/OpenCL/m00030_a0.cl +++ b/OpenCL/m00030_a0.cl @@ -144,7 +144,7 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; @@ -404,7 +404,7 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; diff --git a/OpenCL/m00030_a1.cl b/OpenCL/m00030_a1.cl index c7bb6973f..3b8b68065 100644 --- a/OpenCL/m00030_a1.cl +++ b/OpenCL/m00030_a1.cl @@ -70,7 +70,7 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -133,7 +133,7 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -168,7 +168,7 @@ __kernel void m00030_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; @@ -366,7 +366,7 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -441,7 +441,7 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -476,7 +476,7 @@ __kernel void m00030_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; diff --git a/OpenCL/m00030_a3.cl b/OpenCL/m00030_a3.cl index 201b2fadf..64b03630b 100644 --- a/OpenCL/m00030_a3.cl +++ b/OpenCL/m00030_a3.cl @@ -23,7 +23,7 @@ #define MD5_STEP_REV(f,a,b,c,d,x,t,s) \ { \ a -= b; \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= f (b, c, d); \ a -= x; \ a -= t; \ @@ -32,7 +32,7 @@ #define MD5_STEP_REV1(f,a,b,c,d,x,t,s) \ { \ a -= b; \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= x; \ a -= t; \ } @@ -78,7 +78,7 @@ static void m00030m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); w[ 0] |= salt_buf0[0]; w[ 1] |= salt_buf0[1]; @@ -363,32 +363,32 @@ static void m00030s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - u32x a_rev = digests_buf[digests_offset].digest_buf[0]; - u32x b_rev = digests_buf[digests_offset].digest_buf[1]; - u32x c_rev = digests_buf[digests_offset].digest_buf[2]; - u32x d_rev = digests_buf[digests_offset].digest_buf[3]; + u32 a_rev = digests_buf[digests_offset].digest_buf[0]; + u32 b_rev = digests_buf[digests_offset].digest_buf[1]; + u32 c_rev = digests_buf[digests_offset].digest_buf[2]; + u32 d_rev = digests_buf[digests_offset].digest_buf[3]; - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30); - MD5_STEP_REV (MD5_I, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33); - MD5_STEP_REV (MD5_I, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32); - MD5_STEP_REV (MD5_I, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31); - MD5_STEP_REV (MD5_I, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 9], MD5C3f, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 2], MD5C3e, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[11], MD5C3d, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 4], MD5C3c, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[13], MD5C3b, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD5C3a, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[15], MD5C39, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[ 8], MD5C38, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 1], MD5C37, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[10], MD5C36, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 3], MD5C35, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, w[12], MD5C34, MD5S30); + MD5_STEP_REV (MD5_I_S, b_rev, c_rev, d_rev, a_rev, w[ 5], MD5C33, MD5S33); + MD5_STEP_REV (MD5_I_S, c_rev, d_rev, a_rev, b_rev, w[14], MD5C32, MD5S32); + MD5_STEP_REV (MD5_I_S, d_rev, a_rev, b_rev, c_rev, w[ 7], MD5C31, MD5S31); + MD5_STEP_REV (MD5_I_S, a_rev, b_rev, c_rev, d_rev, 0, MD5C30, MD5S30); - const u32x pre_cd = c_rev ^ d_rev; + const u32 pre_cd = c_rev ^ d_rev; - MD5_STEP_REV1(MD5_H, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23); - MD5_STEP_REV1(MD5_H, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22); + MD5_STEP_REV1(MD5_H_S, b_rev, c_rev, d_rev, a_rev, w[ 2], MD5C2f, MD5S23); + MD5_STEP_REV1(MD5_H_S, c_rev, d_rev, a_rev, b_rev, w[15], MD5C2e, MD5S22); /** * loop diff --git a/OpenCL/m00040_a0.cl b/OpenCL/m00040_a0.cl index 6b0591a4d..3e148120c 100644 --- a/OpenCL/m00040_a0.cl +++ b/OpenCL/m00040_a0.cl @@ -124,7 +124,7 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -143,6 +143,8 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ + u32 tmp2; + u32 a = MD5M_A; u32 b = MD5M_B; u32 c = MD5M_C; @@ -182,22 +184,22 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -348,7 +350,7 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -367,6 +369,8 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ + u32 tmp2; + u32 a = MD5M_A; u32 b = MD5M_B; u32 c = MD5M_C; @@ -406,22 +410,22 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); diff --git a/OpenCL/m00040_a1.cl b/OpenCL/m00040_a1.cl index 36ec0ad12..c687d1b54 100644 --- a/OpenCL/m00040_a1.cl +++ b/OpenCL/m00040_a1.cl @@ -70,7 +70,7 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -133,7 +133,7 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -172,7 +172,7 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -191,6 +191,8 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ + u32 tmp2; + u32 a = MD5M_A; u32 b = MD5M_B; u32 c = MD5M_C; @@ -230,22 +232,22 @@ __kernel void m00040_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -330,7 +332,7 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -405,7 +407,7 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -444,7 +446,7 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -463,6 +465,8 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * md5 */ + u32 tmp2; + u32 a = MD5M_A; u32 b = MD5M_B; u32 c = MD5M_C; @@ -502,22 +506,22 @@ __kernel void m00040_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); diff --git a/OpenCL/m00040_a3.cl b/OpenCL/m00040_a3.cl index ab3f96860..937de716b 100644 --- a/OpenCL/m00040_a3.cl +++ b/OpenCL/m00040_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -65,73 +65,116 @@ static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = w0[0]; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t[0] = w0[0]; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; + overwrite_at_le (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; w3_t[2] = pw_salt_len * 8; - - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; + w3_t[3] = 0; /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x tmp2; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -167,22 +210,22 @@ static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -201,13 +244,7 @@ static void m00040m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -268,73 +305,116 @@ static void m00040s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = w0[0]; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t[0] = w0[0]; - w0_t[1] = w0[1]; - w0_t[2] = w0[2]; - w0_t[3] = w0[3]; - w1_t[0] = w1[0]; - w1_t[1] = w1[1]; - w1_t[2] = w1[2]; - w1_t[3] = w1[3]; - w2_t[0] = w2[0]; - w2_t[1] = w2[1]; - w2_t[2] = w2[2]; - w2_t[3] = w2[3]; - w3_t[0] = w3[0]; - w3_t[1] = w3[1]; - w3_t[2] = w3[2]; - w3_t[3] = w3[3]; + overwrite_at_le (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; w3_t[2] = pw_salt_len * 8; - - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] |= salt_buf3[2]; - w3_t[3] |= salt_buf3[3]; + w3_t[3] = 0; /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x tmp2; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -370,22 +450,22 @@ static void m00040s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H1, a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H2, d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H1, c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H2, b, c, d, a, w0_t[2], MD5C2f, MD5S23); MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); @@ -401,21 +481,13 @@ static void m00040s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - bool q_cond = allx (search[0] != a); - - if (q_cond) continue; + if (MATCHES_NONE_VS (a, search[0])) continue; MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m00050_a1.cl b/OpenCL/m00050_a1.cl index e787c6017..d4373fb09 100644 --- a/OpenCL/m00050_a1.cl +++ b/OpenCL/m00050_a1.cl @@ -253,7 +253,7 @@ __kernel void m00050_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -330,7 +330,7 @@ __kernel void m00050_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -484,7 +484,7 @@ __kernel void m00050_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -573,7 +573,7 @@ __kernel void m00050_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00050_a3.cl b/OpenCL/m00050_a3.cl index a5d1ff4aa..470ba9a79 100644 --- a/OpenCL/m00050_a3.cl +++ b/OpenCL/m00050_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,35 +18,33 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; - u32 tmp2; + u32x tmp2; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -120,7 +120,7 @@ static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co digest[3] += d; } -static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) +static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -171,7 +171,7 @@ static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4 md5_transform (w0, w1, w2, w3, opad); } -static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) +static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -254,46 +254,46 @@ static void m00050m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -314,16 +314,11 @@ static void m00050m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = (64 + salt_len) * 8; w3_t[3] = 0; - u32 digest[4]; + u32x digest[4]; hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]); } } @@ -388,46 +383,46 @@ static void m00050s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -448,16 +443,11 @@ static void m00050s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = (64 + salt_len) * 8; w3_t[3] = 0; - u32 digest[4]; + u32x digest[4]; hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]); } } diff --git a/OpenCL/m00060_a1.cl b/OpenCL/m00060_a1.cl index bae79e018..c97b0c377 100644 --- a/OpenCL/m00060_a1.cl +++ b/OpenCL/m00060_a1.cl @@ -253,7 +253,7 @@ __kernel void m00060_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -351,7 +351,7 @@ __kernel void m00060_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -470,7 +470,7 @@ __kernel void m00060_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -580,7 +580,7 @@ __kernel void m00060_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00060_a3.cl b/OpenCL/m00060_a3.cl index 879dd7b00..ba1faf35e 100644 --- a/OpenCL/m00060_a3.cl +++ b/OpenCL/m00060_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,35 +18,33 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; - u32 tmp2; + u32x tmp2; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -120,7 +120,7 @@ static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co digest[3] += d; } -static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) +static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -171,7 +171,7 @@ static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4 md5_transform (w0, w1, w2, w3, opad); } -static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) +static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -236,36 +236,36 @@ static void m00060m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -275,15 +275,13 @@ static void m00060m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0rl = w0r | w0l; - append_0x80_4x4 (w0, w1, w2, w3, pw_len); - - w0_t[0] = w0[0]; + w0_t[0] = w0rl; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -300,16 +298,13 @@ static void m00060m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = (64 + pw_len) * 8; w3_t[3] = 0; - u32 digest[4]; + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_len); + + u32x digest[4]; hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]); } } @@ -344,36 +339,36 @@ static void m00060s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; w0_t[2] = salt_buf0[2]; w0_t[3] = salt_buf0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = salt_buf1[0]; w1_t[1] = salt_buf1[1]; w1_t[2] = salt_buf1[2]; w1_t[3] = salt_buf1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -395,15 +390,13 @@ static void m00060s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0rl = w0r | w0l; - append_0x80_4x4 (w0, w1, w2, w3, pw_len); - - w0_t[0] = w0[0]; + w0_t[0] = w0rl; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -420,16 +413,13 @@ static void m00060s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = (64 + pw_len) * 8; w3_t[3] = 0; - u32 digest[4]; + append_0x80_4x4 (w0_t, w1_t, w2_t, w3_t, pw_len); + + u32x digest[4]; hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]); } } diff --git a/OpenCL/m00100_a1.cl b/OpenCL/m00100_a1.cl index 79bac978f..a98d2a3ef 100644 --- a/OpenCL/m00100_a1.cl +++ b/OpenCL/m00100_a1.cl @@ -70,7 +70,7 @@ __kernel void m00100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -115,7 +115,7 @@ __kernel void m00100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -337,7 +337,7 @@ __kernel void m00100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -400,7 +400,7 @@ __kernel void m00100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00100_a3.cl b/OpenCL/m00100_a3.cl index 77e610c84..4b8439e5e 100644 --- a/OpenCL/m00100_a3.cl +++ b/OpenCL/m00100_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m00100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -33,66 +33,66 @@ static void m00100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -116,43 +116,43 @@ static void m00100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -247,30 +247,24 @@ static void m00100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16)); SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21)); SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s)); SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20)); SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22)); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } -static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -283,66 +277,66 @@ static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -376,7 +370,7 @@ static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - const u32 e_rev = rotl32 (search[1], 2u) - SHA1C03; + const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03; /** * loop @@ -384,43 +378,43 @@ static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -516,36 +510,28 @@ static void m00100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); - bool q_cond = allx (e_rev != e); - - if (q_cond) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21)); SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s)); SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20)); SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22)); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } -__kernel void m00100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -583,7 +569,7 @@ __kernel void m00100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -621,7 +607,7 @@ __kernel void m00100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -659,7 +645,7 @@ __kernel void m00100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -697,7 +683,7 @@ __kernel void m00100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -735,7 +721,7 @@ __kernel void m00100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m00110_a0.cl b/OpenCL/m00110_a0.cl index dc8abe1fb..9e4b1d82f 100644 --- a/OpenCL/m00110_a0.cl +++ b/OpenCL/m00110_a0.cl @@ -142,7 +142,7 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; @@ -447,7 +447,7 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; diff --git a/OpenCL/m00110_a1.cl b/OpenCL/m00110_a1.cl index 58318ab49..2642ae466 100644 --- a/OpenCL/m00110_a1.cl +++ b/OpenCL/m00110_a1.cl @@ -68,7 +68,7 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -131,7 +131,7 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -166,7 +166,7 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -389,7 +389,7 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -470,7 +470,7 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -505,7 +505,7 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m00110_a3.cl b/OpenCL/m00110_a3.cl index 71d965677..68a78486f 100644 --- a/OpenCL/m00110_a3.cl +++ b/OpenCL/m00110_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -61,24 +61,24 @@ static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap32 (salt_buf0[0]); - w[ 1] |= swap32 (salt_buf0[1]); - w[ 2] |= swap32 (salt_buf0[2]); - w[ 3] |= swap32 (salt_buf0[3]); - w[ 4] |= swap32 (salt_buf1[0]); - w[ 5] |= swap32 (salt_buf1[1]); - w[ 6] |= swap32 (salt_buf1[2]); - w[ 7] |= swap32 (salt_buf1[3]); - w[ 8] |= swap32 (salt_buf2[0]); - w[ 9] |= swap32 (salt_buf2[1]); - w[10] |= swap32 (salt_buf2[2]); - w[11] |= swap32 (salt_buf2[3]); - w[12] |= swap32 (salt_buf3[0]); - w[13] |= swap32 (salt_buf3[1]); - w[14] |= swap32 (salt_buf3[2]); - w[15] |= swap32 (salt_buf3[3]); + w[ 0] |= swap32_S (salt_buf0[0]); + w[ 1] |= swap32_S (salt_buf0[1]); + w[ 2] |= swap32_S (salt_buf0[2]); + w[ 3] |= swap32_S (salt_buf0[3]); + w[ 4] |= swap32_S (salt_buf1[0]); + w[ 5] |= swap32_S (salt_buf1[1]); + w[ 6] |= swap32_S (salt_buf1[2]); + w[ 7] |= swap32_S (salt_buf1[3]); + w[ 8] |= swap32_S (salt_buf2[0]); + w[ 9] |= swap32_S (salt_buf2[1]); + w[10] |= swap32_S (salt_buf2[2]); + w[11] |= swap32_S (salt_buf2[3]); + w[12] |= swap32_S (salt_buf3[0]); + w[13] |= swap32_S (salt_buf3[1]); + w[14] |= swap32_S (salt_buf3[2]); + w[15] |= swap32_S (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -90,70 +90,70 @@ static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_76s = rotl32_S ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32 c_77s = rotl32_S ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32 c_78s = rotl32_S ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32 c_79s = rotl32_S ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -177,45 +177,45 @@ static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -314,17 +314,11 @@ static void m00110m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20)); SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22)); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } -static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -337,66 +331,66 @@ static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -430,7 +424,7 @@ static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - const u32 e_rev = rotl32 (search[1], 2u) - SHA1C03; + const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03; /** * loop @@ -438,43 +432,43 @@ static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -570,36 +564,28 @@ static void m00110s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); - bool q_cond = allx (e_rev != e); - - if (q_cond) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21)); SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s)); SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20)); SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22)); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } -__kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -637,7 +623,7 @@ __kernel void m00110_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00110m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00110_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00110_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -675,7 +661,7 @@ __kernel void m00110_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00110m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00110_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00110_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -713,7 +699,7 @@ __kernel void m00110_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00110m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -751,7 +737,7 @@ __kernel void m00110_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00110s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00110_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00110_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -789,7 +775,7 @@ __kernel void m00110_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00110s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00110_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00110_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m00120_a0.cl b/OpenCL/m00120_a0.cl index c1360e61a..f8f130b71 100644 --- a/OpenCL/m00120_a0.cl +++ b/OpenCL/m00120_a0.cl @@ -132,7 +132,7 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -416,7 +416,7 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m00120_a1.cl b/OpenCL/m00120_a1.cl index babf47a78..8adc5c4fc 100644 --- a/OpenCL/m00120_a1.cl +++ b/OpenCL/m00120_a1.cl @@ -68,7 +68,7 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -125,7 +125,7 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -178,7 +178,7 @@ __kernel void m00120_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -382,7 +382,7 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -457,7 +457,7 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -510,7 +510,7 @@ __kernel void m00120_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m00120_a3.cl b/OpenCL/m00120_a3.cl index c9dbea8ad..de239dbe1 100644 --- a/OpenCL/m00120_a3.cl +++ b/OpenCL/m00120_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,104 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void overwrite_at (u32 sw[16], const u32 w0, const u32 salt_len) -{ - switch (salt_len) - { - case 0: sw[0] = w0; - break; - case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8); - sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); - break; - case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16); - sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); - break; - case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24); - sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); - break; - case 4: sw[1] = w0; - break; - case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); - sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); - break; - case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); - sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); - break; - case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); - sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); - break; - case 8: sw[2] = w0; - break; - case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); - sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); - break; - case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); - sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); - break; - case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); - sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); - break; - case 12: sw[3] = w0; - break; - case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); - sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); - break; - case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); - sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); - break; - case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); - sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); - break; - case 16: sw[4] = w0; - break; - case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); - sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); - break; - case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); - sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); - break; - case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); - sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); - break; - case 20: sw[5] = w0; - break; - case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); - sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); - break; - case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); - sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); - break; - case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); - sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); - break; - case 24: sw[6] = w0; - break; - case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); - sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); - break; - case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); - sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); - break; - case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); - sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); - break; - case 28: sw[7] = w0; - break; - case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); - sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24); - break; - case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); - sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16); - break; - case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); - sw[8] = (sw[8] & 0x000000ff) | (w0 << 8); - break; - } -} +#include "OpenCL/simd.c" static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -169,24 +74,24 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -205,22 +110,22 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - w3_t[2] = swap32 (w3_t[2]); - w3_t[3] = swap32 (w3_t[3]); + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); /** * loop @@ -228,13 +133,13 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - const u32 w0n = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wx[16]; + u32x wx[16]; wx[ 0] = w0_t[0]; wx[ 1] = w0_t[1]; @@ -253,12 +158,12 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wx[14] = w3_t[2]; wx[15] = w3_t[3]; - overwrite_at (wx, w0n, salt_len); + overwrite_at_be (wx, w0lr, salt_len); - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = wx[ 0]; w0_t[1] = wx[ 1]; @@ -281,11 +186,11 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -383,12 +288,7 @@ static void m00120m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -417,7 +317,7 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * reverse */ - const u32 e_rev = rotl32 (search[1], 2u); + const u32 e_rev = rotl32_S (search[1], 2u); /** * salt @@ -464,24 +364,24 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -500,22 +400,22 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - w3_t[2] = swap32 (w3_t[2]); - w3_t[3] = swap32 (w3_t[3]); + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); /** * loop @@ -523,13 +423,13 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - const u32 w0n = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wx[16]; + u32x wx[16]; wx[ 0] = w0_t[0]; wx[ 1] = w0_t[1]; @@ -548,12 +448,12 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wx[14] = w3_t[2]; wx[15] = w3_t[3]; - overwrite_at (wx, w0n, salt_len); + overwrite_at_be (wx, w0lr, salt_len); - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = wx[ 0]; w0_t[1] = wx[ 1]; @@ -576,11 +476,11 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -675,19 +575,13 @@ static void m00120s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); - if (allx (e != e_rev)) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } diff --git a/OpenCL/m00130_a0.cl b/OpenCL/m00130_a0.cl index 4abb9c422..493883726 100644 --- a/OpenCL/m00130_a0.cl +++ b/OpenCL/m00130_a0.cl @@ -144,7 +144,7 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; @@ -456,7 +456,7 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; diff --git a/OpenCL/m00130_a1.cl b/OpenCL/m00130_a1.cl index 7c28ddcab..58eae9598 100644 --- a/OpenCL/m00130_a1.cl +++ b/OpenCL/m00130_a1.cl @@ -70,7 +70,7 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -133,7 +133,7 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -168,7 +168,7 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; @@ -412,7 +412,7 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -493,7 +493,7 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -528,7 +528,7 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; diff --git a/OpenCL/m00130_a3.cl b/OpenCL/m00130_a3.cl index 90d7f9323..e405dbd5e 100644 --- a/OpenCL/m00130_a3.cl +++ b/OpenCL/m00130_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -61,24 +61,24 @@ static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap32 (salt_buf0[0]); - w[ 1] |= swap32 (salt_buf0[1]); - w[ 2] |= swap32 (salt_buf0[2]); - w[ 3] |= swap32 (salt_buf0[3]); - w[ 4] |= swap32 (salt_buf1[0]); - w[ 5] |= swap32 (salt_buf1[1]); - w[ 6] |= swap32 (salt_buf1[2]); - w[ 7] |= swap32 (salt_buf1[3]); - w[ 8] |= swap32 (salt_buf2[0]); - w[ 9] |= swap32 (salt_buf2[1]); - w[10] |= swap32 (salt_buf2[2]); - w[11] |= swap32 (salt_buf2[3]); - w[12] |= swap32 (salt_buf3[0]); - w[13] |= swap32 (salt_buf3[1]); - w[14] |= swap32 (salt_buf3[2]); - w[15] |= swap32 (salt_buf3[3]); + w[ 0] |= swap32_S (salt_buf0[0]); + w[ 1] |= swap32_S (salt_buf0[1]); + w[ 2] |= swap32_S (salt_buf0[2]); + w[ 3] |= swap32_S (salt_buf0[3]); + w[ 4] |= swap32_S (salt_buf1[0]); + w[ 5] |= swap32_S (salt_buf1[1]); + w[ 6] |= swap32_S (salt_buf1[2]); + w[ 7] |= swap32_S (salt_buf1[3]); + w[ 8] |= swap32_S (salt_buf2[0]); + w[ 9] |= swap32_S (salt_buf2[1]); + w[10] |= swap32_S (salt_buf2[2]); + w[11] |= swap32_S (salt_buf2[3]); + w[12] |= swap32_S (salt_buf3[0]); + w[13] |= swap32_S (salt_buf3[1]); + w[14] |= swap32_S (salt_buf3[2]); + w[15] |= swap32_S (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -90,70 +90,70 @@ static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_76s = rotl32_S ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32 c_77s = rotl32_S ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32 c_78s = rotl32_S ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32 c_79s = rotl32_S ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -171,51 +171,51 @@ static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k const u32 c_65sK = c_65s + SHA1C03; const u32 c_69sK = c_69s + SHA1C03; - /** + /** * loop */ u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -314,17 +314,11 @@ static void m00130m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20)); SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22)); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } -static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -337,66 +331,66 @@ static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -430,7 +424,7 @@ static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - const u32 e_rev = rotl32 (search[1], 2u) - SHA1C03; + const u32 e_rev = rotl32_S (search[1], 2u) - SHA1C03; /** * loop @@ -438,43 +432,43 @@ static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -570,36 +564,28 @@ static void m00130s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP_PE (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); - bool q_cond = allx (e_rev != e); - - if (q_cond) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; SHA1_STEP_PB (SHA1_F1, a, b, c, d, e, 0); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21)); SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s)); SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_78s ^ w0s07 ^ w0s08 ^ w0s15 ^ w0s18 ^ w0s20)); SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_79s ^ w0s08 ^ w0s22)); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } -__kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -637,7 +623,7 @@ __kernel void m00130_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00130m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00130_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00130_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -675,7 +661,7 @@ __kernel void m00130_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00130m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00130_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00130_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -713,7 +699,7 @@ __kernel void m00130_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00130m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -751,7 +737,7 @@ __kernel void m00130_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00130s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00130_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00130_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -789,7 +775,7 @@ __kernel void m00130_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00130s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00130_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00130_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m00140_a0.cl b/OpenCL/m00140_a0.cl index 263f5d126..9d4ce1ef6 100644 --- a/OpenCL/m00140_a0.cl +++ b/OpenCL/m00140_a0.cl @@ -126,7 +126,7 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -402,7 +402,7 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m00140_a1.cl b/OpenCL/m00140_a1.cl index a74b118ed..e48b70f5a 100644 --- a/OpenCL/m00140_a1.cl +++ b/OpenCL/m00140_a1.cl @@ -70,7 +70,7 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -133,7 +133,7 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -172,7 +172,7 @@ __kernel void m00140_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -376,7 +376,7 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -457,7 +457,7 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -496,7 +496,7 @@ __kernel void m00140_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m00140_a3.cl b/OpenCL/m00140_a3.cl index 897faf3c2..12b380c89 100644 --- a/OpenCL/m00140_a3.cl +++ b/OpenCL/m00140_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,104 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void overwrite_at (u32 sw[16], const u32 w0, const u32 salt_len) -{ - switch (salt_len) - { - case 0: sw[0] = w0; - break; - case 1: sw[0] = (sw[0] & 0xff000000) | (w0 >> 8); - sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); - break; - case 2: sw[0] = (sw[0] & 0xffff0000) | (w0 >> 16); - sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); - break; - case 3: sw[0] = (sw[0] & 0xffffff00) | (w0 >> 24); - sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); - break; - case 4: sw[1] = w0; - break; - case 5: sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); - sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); - break; - case 6: sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); - sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); - break; - case 7: sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); - sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); - break; - case 8: sw[2] = w0; - break; - case 9: sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); - sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); - break; - case 10: sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); - sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); - break; - case 11: sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); - sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); - break; - case 12: sw[3] = w0; - break; - case 13: sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); - sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); - break; - case 14: sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); - sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); - break; - case 15: sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); - sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); - break; - case 16: sw[4] = w0; - break; - case 17: sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); - sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); - break; - case 18: sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); - sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); - break; - case 19: sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); - sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); - break; - case 20: sw[5] = w0; - break; - case 21: sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); - sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); - break; - case 22: sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); - sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); - break; - case 23: sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); - sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); - break; - case 24: sw[6] = w0; - break; - case 25: sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); - sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); - break; - case 26: sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); - sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); - break; - case 27: sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); - sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); - break; - case 28: sw[7] = w0; - break; - case 29: sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); - sw[8] = (sw[8] & 0x00ffffff) | (w0 << 24); - break; - case 30: sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); - sw[8] = (sw[8] & 0x0000ffff) | (w0 << 16); - break; - case 31: sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); - sw[8] = (sw[8] & 0x000000ff) | (w0 << 8); - break; - } -} +#include "OpenCL/simd.c" static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -169,24 +74,24 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -205,22 +110,22 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - w3_t[2] = swap32 (w3_t[2]); - w3_t[3] = swap32 (w3_t[3]); + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); /** * loop @@ -228,13 +133,13 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - const u32 w0n = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wx[16]; + u32x wx[16]; wx[ 0] = w0_t[0]; wx[ 1] = w0_t[1]; @@ -253,12 +158,12 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wx[14] = w3_t[2]; wx[15] = w3_t[3]; - overwrite_at (wx, w0n, salt_len); + overwrite_at_be (wx, w0lr, salt_len); - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = wx[ 0]; w0_t[1] = wx[ 1]; @@ -281,11 +186,11 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -383,12 +288,7 @@ static void m00140m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -417,7 +317,7 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * reverse */ - const u32 e_rev = rotl32 (search[1], 2u); + const u32 e_rev = rotl32_S (search[1], 2u); /** * salt @@ -464,24 +364,24 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w2_t[4]; u32 w3_t[4]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -500,22 +400,22 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] |= salt_buf3[2]; w3_t[3] |= salt_buf3[3]; - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - w3_t[2] = swap32 (w3_t[2]); - w3_t[3] = swap32 (w3_t[3]); + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); /** * loop @@ -523,13 +423,13 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - const u32 w0n = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wx[16]; + u32x wx[16]; wx[ 0] = w0_t[0]; wx[ 1] = w0_t[1]; @@ -548,12 +448,12 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wx[14] = w3_t[2]; wx[15] = w3_t[3]; - overwrite_at (wx, w0n, salt_len); + overwrite_at_be (wx, w0lr, salt_len); - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = wx[ 0]; w0_t[1] = wx[ 1]; @@ -576,11 +476,11 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -675,19 +575,13 @@ static void m00140s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); - if (allx (e != e_rev)) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } diff --git a/OpenCL/m00150_a1.cl b/OpenCL/m00150_a1.cl index 97d5488da..1526cf204 100644 --- a/OpenCL/m00150_a1.cl +++ b/OpenCL/m00150_a1.cl @@ -285,7 +285,7 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -348,7 +348,7 @@ __kernel void m00150_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -502,7 +502,7 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -577,7 +577,7 @@ __kernel void m00150_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00150_a3.cl b/OpenCL/m00150_a3.cl index e510e0c3c..219e3cd7a 100644 --- a/OpenCL/m00150_a3.cl +++ b/OpenCL/m00150_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,34 +18,32 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -148,7 +148,7 @@ static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], c digest[4] += E; } -static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) +static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -201,7 +201,7 @@ static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[ sha1_transform (w0, w1, w2, w3, opad); } -static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -272,46 +272,46 @@ static void m00150m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -332,16 +332,11 @@ static void m00150m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + salt_len) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); } } @@ -392,46 +387,46 @@ static void m00150s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -452,16 +447,11 @@ static void m00150s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + salt_len) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); } } diff --git a/OpenCL/m00160_a1.cl b/OpenCL/m00160_a1.cl index ffe13e0b1..adba7db6f 100644 --- a/OpenCL/m00160_a1.cl +++ b/OpenCL/m00160_a1.cl @@ -285,7 +285,7 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -383,7 +383,7 @@ __kernel void m00160_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -502,7 +502,7 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -612,7 +612,7 @@ __kernel void m00160_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00160_a3.cl b/OpenCL/m00160_a3.cl index 4f3a34af0..126927be8 100644 --- a/OpenCL/m00160_a3.cl +++ b/OpenCL/m00160_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,34 +18,32 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -148,7 +148,7 @@ static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], c digest[4] += E; } -static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) +static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -201,7 +201,7 @@ static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[ sha1_transform (w0, w1, w2, w3, opad); } -static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -268,36 +268,36 @@ static void m00160m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = swap32 (salt_buf0[0]); w0_t[1] = swap32 (salt_buf0[1]); w0_t[2] = swap32 (salt_buf0[2]); w0_t[3] = swap32 (salt_buf0[3]); - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = swap32 (salt_buf1[0]); w1_t[1] = swap32 (salt_buf1[1]); w1_t[2] = swap32 (salt_buf1[2]); w1_t[3] = swap32 (salt_buf1[3]); - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -307,13 +307,13 @@ static void m00160m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -330,16 +330,11 @@ static void m00160m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); } } @@ -374,36 +369,36 @@ static void m00160s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = swap32 (salt_buf0[0]); w0_t[1] = swap32 (salt_buf0[1]); w0_t[2] = swap32 (salt_buf0[2]); w0_t[3] = swap32 (salt_buf0[3]); - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = swap32 (salt_buf1[0]); w1_t[1] = swap32 (salt_buf1[1]); w1_t[2] = swap32 (salt_buf1[2]); w1_t[3] = swap32 (salt_buf1[3]); - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -425,13 +420,13 @@ static void m00160s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -448,16 +443,11 @@ static void m00160s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); } } diff --git a/OpenCL/m00190_a1.cl b/OpenCL/m00190_a1.cl index b1eedc0ae..3eec61cb5 100644 --- a/OpenCL/m00190_a1.cl +++ b/OpenCL/m00190_a1.cl @@ -70,7 +70,7 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -115,7 +115,7 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -354,7 +354,7 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -411,7 +411,7 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00190_a3.cl b/OpenCL/m00190_a3.cl index e5b316546..2ff5a8173 100644 --- a/OpenCL/m00190_a3.cl +++ b/OpenCL/m00190_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -33,66 +33,66 @@ static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -116,43 +116,43 @@ static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -247,13 +247,13 @@ static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16)); SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21)); SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s)); @@ -265,29 +265,15 @@ static void m00190m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k d += SHA1M_D; c += SHA1M_C; - { - const u32 r0 = a; - const u32 r1 = e; - const u32 r2 = d; - const u32 r3 = c; - - #include COMPARE_M - } + COMPARE_M_SIMD (a, e, d, c); a &= 0x00000fff; - { - const u32 r0 = a; - const u32 r1 = e; - const u32 r2 = d; - const u32 r3 = c; - - #include COMPARE_M - } + COMPARE_M_SIMD (a, e, d, c); } } -static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -300,66 +286,66 @@ static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -395,43 +381,43 @@ static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -524,16 +510,15 @@ static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_72s ^ w0s05 ^ w0s11 ^ w0s12 ^ w0s13 ^ w0s16 ^ w0s18)); SHA1_STEP (SHA1_F1 , c, d, e, a, b, (c_73s ^ w0s20)); SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16)); + SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); - SHA1_STEP (SHA1_F1, a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); + const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21)); SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s)); @@ -545,29 +530,15 @@ static void m00190s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k d += SHA1M_D; c += SHA1M_C; - { - const u32 r0 = a; - const u32 r1 = e; - const u32 r2 = d; - const u32 r3 = c; - - #include COMPARE_S - } + COMPARE_S_SIMD (a, e, d, c); a &= 0x00000fff; - { - const u32 r0 = a; - const u32 r1 = e; - const u32 r2 = d; - const u32 r3 = c; - - #include COMPARE_S - } + COMPARE_S_SIMD (a, e, d, c); } } -__kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -605,7 +576,7 @@ __kernel void m00190_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00190m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00190_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00190_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -643,7 +614,7 @@ __kernel void m00190_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00190m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00190_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00190_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -681,7 +652,7 @@ __kernel void m00190_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00190m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -719,7 +690,7 @@ __kernel void m00190_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00190s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00190_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00190_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -757,7 +728,7 @@ __kernel void m00190_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00190s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00190_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00190_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m00200_a1.cl b/OpenCL/m00200_a1.cl index a5a030331..c9e224263 100644 --- a/OpenCL/m00200_a1.cl +++ b/OpenCL/m00200_a1.cl @@ -68,7 +68,7 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -111,7 +111,7 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w_t[16]; @@ -246,7 +246,7 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -301,7 +301,7 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w_t[16]; diff --git a/OpenCL/m00200_a3.cl b/OpenCL/m00200_a3.cl index 5822e581a..24cc92e89 100644 --- a/OpenCL/m00200_a3.cl +++ b/OpenCL/m00200_a3.cl @@ -5,6 +5,8 @@ #define _MYSQL323_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -35,16 +35,18 @@ static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MYSQL323_A; - u32 b = MYSQL323_B; + u32x a = MYSQL323_A; + u32x b = MYSQL323_B; + u32x c = 0; + u32x d = 0; - u32 add = 7; + u32x add = 7; #define ROUND(v) \ { \ @@ -81,7 +83,7 @@ static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k for (i = 4, j = 1; i <= (int) pw_len - 4; i += 4, j += 1) { - const u32 wj = w[j]; + const u32x wj = w[j]; ROUND ((wj >> 0) & 0xff); ROUND ((wj >> 8) & 0xff); @@ -89,7 +91,7 @@ static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k ROUND ((wj >> 24) & 0xff); } - const u32 wj = w[j]; + const u32x wj = w[j]; const u32 left = pw_len - i; @@ -112,16 +114,11 @@ static void m00200m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k a &= 0x7fffffff; b &= 0x7fffffff; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } -static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -148,16 +145,18 @@ static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MYSQL323_A; - u32 b = MYSQL323_B; + u32x a = MYSQL323_A; + u32x b = MYSQL323_B; + u32x c = 0; + u32x d = 0; - u32 add = 7; + u32x add = 7; #define ROUND(v) \ { \ @@ -194,7 +193,7 @@ static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k for (i = 4, j = 1; i <= (int) pw_len - 4; i += 4, j += 1) { - const u32 wj = w[j]; + const u32x wj = w[j]; ROUND ((wj >> 0) & 0xff); ROUND ((wj >> 8) & 0xff); @@ -202,7 +201,7 @@ static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k ROUND ((wj >> 24) & 0xff); } - const u32 wj = w[j]; + const u32x wj = w[j]; const u32 left = pw_len - i; @@ -225,16 +224,11 @@ static void m00200s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k a &= 0x7fffffff; b &= 0x7fffffff; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } -__kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -272,7 +266,7 @@ __kernel void m00200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00200m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00200_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00200_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -310,7 +304,7 @@ __kernel void m00200_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00200m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00200_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00200_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -348,7 +342,7 @@ __kernel void m00200_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00200m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -386,7 +380,7 @@ __kernel void m00200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00200s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00200_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00200_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -424,7 +418,7 @@ __kernel void m00200_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00200s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00200_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00200_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m00300_a1.cl b/OpenCL/m00300_a1.cl index f78d38585..ca70b9f11 100644 --- a/OpenCL/m00300_a1.cl +++ b/OpenCL/m00300_a1.cl @@ -70,7 +70,7 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -115,7 +115,7 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -462,7 +462,7 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -525,7 +525,7 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00300_a3.cl b/OpenCL/m00300_a3.cl index e5d6782e8..83b32198a 100644 --- a/OpenCL/m00300_a3.cl +++ b/OpenCL/m00300_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -33,70 +33,70 @@ static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_76s = rotl32_S ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32 c_77s = rotl32_S ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32 c_78s = rotl32_S ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32 c_79s = rotl32_S ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -136,45 +136,45 @@ static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -279,22 +279,22 @@ static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k d += SHA1M_D; e += SHA1M_E; - u32 w0_t = a; - u32 w1_t = b; - u32 w2_t = c; - u32 w3_t = d; - u32 w4_t = e; - u32 w5_t = 0x80000000; - u32 w6_t = 0; - u32 w7_t = 0; - u32 w8_t = 0; - u32 w9_t = 0; - u32 wa_t = 0; - u32 wb_t = 0; - u32 wc_t = 0; - u32 wd_t = 0; - u32 we_t = 0; - u32 wf_t = 20 * 8; + u32x w0_t = a; + u32x w1_t = b; + u32x w2_t = c; + u32x w3_t = d; + u32x w4_t = e; + u32x w5_t = 0x80000000; + u32x w6_t = 0; + u32x w7_t = 0; + u32x w8_t = 0; + u32x w9_t = 0; + u32x wa_t = 0; + u32x wb_t = 0; + u32x wc_t = 0; + u32x wd_t = 0; + u32x we_t = 0; + u32x wf_t = 20 * 8; a = SHA1M_A; b = SHA1M_B; @@ -398,17 +398,11 @@ static void m00300m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } -static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -421,66 +415,66 @@ static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * base */ - const u32 c_16s = rotl32 ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); - const u32 c_17s = rotl32 ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); - const u32 c_18s = rotl32 ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); - const u32 c_19s = rotl32 ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); - const u32 c_20s = rotl32 ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); - const u32 c_21s = rotl32 ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); - const u32 c_22s = rotl32 ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); - const u32 c_23s = rotl32 ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); - const u32 c_24s = rotl32 ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); - const u32 c_25s = rotl32 ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); - const u32 c_26s = rotl32 ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); - const u32 c_27s = rotl32 ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); - const u32 c_28s = rotl32 ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); - const u32 c_29s = rotl32 ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); - const u32 c_30s = rotl32 ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); - const u32 c_31s = rotl32 ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); - const u32 c_32s = rotl32 ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); - const u32 c_33s = rotl32 ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); - const u32 c_34s = rotl32 ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); - const u32 c_35s = rotl32 ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); - const u32 c_36s = rotl32 ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); - const u32 c_37s = rotl32 ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); - const u32 c_38s = rotl32 ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); - const u32 c_39s = rotl32 ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); - const u32 c_40s = rotl32 ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); - const u32 c_41s = rotl32 ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); - const u32 c_42s = rotl32 ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); - const u32 c_43s = rotl32 ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); - const u32 c_44s = rotl32 ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); - const u32 c_45s = rotl32 ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); - const u32 c_46s = rotl32 ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); - const u32 c_47s = rotl32 ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); - const u32 c_48s = rotl32 ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); - const u32 c_49s = rotl32 ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); - const u32 c_50s = rotl32 ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); - const u32 c_51s = rotl32 ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); - const u32 c_52s = rotl32 ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); - const u32 c_53s = rotl32 ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); - const u32 c_54s = rotl32 ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); - const u32 c_55s = rotl32 ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); - const u32 c_56s = rotl32 ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); - const u32 c_57s = rotl32 ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); - const u32 c_58s = rotl32 ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); - const u32 c_59s = rotl32 ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); - const u32 c_60s = rotl32 ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); - const u32 c_61s = rotl32 ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); - const u32 c_62s = rotl32 ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); - const u32 c_63s = rotl32 ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); - const u32 c_64s = rotl32 ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); - const u32 c_65s = rotl32 ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); - const u32 c_66s = rotl32 ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); - const u32 c_67s = rotl32 ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); - const u32 c_68s = rotl32 ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); - const u32 c_69s = rotl32 ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); - const u32 c_70s = rotl32 ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); - const u32 c_71s = rotl32 ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); - const u32 c_72s = rotl32 ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); - const u32 c_73s = rotl32 ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); - const u32 c_74s = rotl32 ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); - const u32 c_75s = rotl32 ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); + const u32 c_16s = rotl32_S ((w[13] ^ w[ 8] ^ w[ 2] ), 1u); + const u32 c_17s = rotl32_S ((w[14] ^ w[ 9] ^ w[ 3] ^ w[ 1]), 1u); + const u32 c_18s = rotl32_S ((w[15] ^ w[10] ^ w[ 4] ^ w[ 2]), 1u); + const u32 c_19s = rotl32_S ((c_16s ^ w[11] ^ w[ 5] ^ w[ 3]), 1u); + const u32 c_20s = rotl32_S ((c_17s ^ w[12] ^ w[ 6] ^ w[ 4]), 1u); + const u32 c_21s = rotl32_S ((c_18s ^ w[13] ^ w[ 7] ^ w[ 5]), 1u); + const u32 c_22s = rotl32_S ((c_19s ^ w[14] ^ w[ 8] ^ w[ 6]), 1u); + const u32 c_23s = rotl32_S ((c_20s ^ w[15] ^ w[ 9] ^ w[ 7]), 1u); + const u32 c_24s = rotl32_S ((c_21s ^ c_16s ^ w[10] ^ w[ 8]), 1u); + const u32 c_25s = rotl32_S ((c_22s ^ c_17s ^ w[11] ^ w[ 9]), 1u); + const u32 c_26s = rotl32_S ((c_23s ^ c_18s ^ w[12] ^ w[10]), 1u); + const u32 c_27s = rotl32_S ((c_24s ^ c_19s ^ w[13] ^ w[11]), 1u); + const u32 c_28s = rotl32_S ((c_25s ^ c_20s ^ w[14] ^ w[12]), 1u); + const u32 c_29s = rotl32_S ((c_26s ^ c_21s ^ w[15] ^ w[13]), 1u); + const u32 c_30s = rotl32_S ((c_27s ^ c_22s ^ c_16s ^ w[14]), 1u); + const u32 c_31s = rotl32_S ((c_28s ^ c_23s ^ c_17s ^ w[15]), 1u); + const u32 c_32s = rotl32_S ((c_29s ^ c_24s ^ c_18s ^ c_16s), 1u); + const u32 c_33s = rotl32_S ((c_30s ^ c_25s ^ c_19s ^ c_17s), 1u); + const u32 c_34s = rotl32_S ((c_31s ^ c_26s ^ c_20s ^ c_18s), 1u); + const u32 c_35s = rotl32_S ((c_32s ^ c_27s ^ c_21s ^ c_19s), 1u); + const u32 c_36s = rotl32_S ((c_33s ^ c_28s ^ c_22s ^ c_20s), 1u); + const u32 c_37s = rotl32_S ((c_34s ^ c_29s ^ c_23s ^ c_21s), 1u); + const u32 c_38s = rotl32_S ((c_35s ^ c_30s ^ c_24s ^ c_22s), 1u); + const u32 c_39s = rotl32_S ((c_36s ^ c_31s ^ c_25s ^ c_23s), 1u); + const u32 c_40s = rotl32_S ((c_37s ^ c_32s ^ c_26s ^ c_24s), 1u); + const u32 c_41s = rotl32_S ((c_38s ^ c_33s ^ c_27s ^ c_25s), 1u); + const u32 c_42s = rotl32_S ((c_39s ^ c_34s ^ c_28s ^ c_26s), 1u); + const u32 c_43s = rotl32_S ((c_40s ^ c_35s ^ c_29s ^ c_27s), 1u); + const u32 c_44s = rotl32_S ((c_41s ^ c_36s ^ c_30s ^ c_28s), 1u); + const u32 c_45s = rotl32_S ((c_42s ^ c_37s ^ c_31s ^ c_29s), 1u); + const u32 c_46s = rotl32_S ((c_43s ^ c_38s ^ c_32s ^ c_30s), 1u); + const u32 c_47s = rotl32_S ((c_44s ^ c_39s ^ c_33s ^ c_31s), 1u); + const u32 c_48s = rotl32_S ((c_45s ^ c_40s ^ c_34s ^ c_32s), 1u); + const u32 c_49s = rotl32_S ((c_46s ^ c_41s ^ c_35s ^ c_33s), 1u); + const u32 c_50s = rotl32_S ((c_47s ^ c_42s ^ c_36s ^ c_34s), 1u); + const u32 c_51s = rotl32_S ((c_48s ^ c_43s ^ c_37s ^ c_35s), 1u); + const u32 c_52s = rotl32_S ((c_49s ^ c_44s ^ c_38s ^ c_36s), 1u); + const u32 c_53s = rotl32_S ((c_50s ^ c_45s ^ c_39s ^ c_37s), 1u); + const u32 c_54s = rotl32_S ((c_51s ^ c_46s ^ c_40s ^ c_38s), 1u); + const u32 c_55s = rotl32_S ((c_52s ^ c_47s ^ c_41s ^ c_39s), 1u); + const u32 c_56s = rotl32_S ((c_53s ^ c_48s ^ c_42s ^ c_40s), 1u); + const u32 c_57s = rotl32_S ((c_54s ^ c_49s ^ c_43s ^ c_41s), 1u); + const u32 c_58s = rotl32_S ((c_55s ^ c_50s ^ c_44s ^ c_42s), 1u); + const u32 c_59s = rotl32_S ((c_56s ^ c_51s ^ c_45s ^ c_43s), 1u); + const u32 c_60s = rotl32_S ((c_57s ^ c_52s ^ c_46s ^ c_44s), 1u); + const u32 c_61s = rotl32_S ((c_58s ^ c_53s ^ c_47s ^ c_45s), 1u); + const u32 c_62s = rotl32_S ((c_59s ^ c_54s ^ c_48s ^ c_46s), 1u); + const u32 c_63s = rotl32_S ((c_60s ^ c_55s ^ c_49s ^ c_47s), 1u); + const u32 c_64s = rotl32_S ((c_61s ^ c_56s ^ c_50s ^ c_48s), 1u); + const u32 c_65s = rotl32_S ((c_62s ^ c_57s ^ c_51s ^ c_49s), 1u); + const u32 c_66s = rotl32_S ((c_63s ^ c_58s ^ c_52s ^ c_50s), 1u); + const u32 c_67s = rotl32_S ((c_64s ^ c_59s ^ c_53s ^ c_51s), 1u); + const u32 c_68s = rotl32_S ((c_65s ^ c_60s ^ c_54s ^ c_52s), 1u); + const u32 c_69s = rotl32_S ((c_66s ^ c_61s ^ c_55s ^ c_53s), 1u); + const u32 c_70s = rotl32_S ((c_67s ^ c_62s ^ c_56s ^ c_54s), 1u); + const u32 c_71s = rotl32_S ((c_68s ^ c_63s ^ c_57s ^ c_55s), 1u); + const u32 c_72s = rotl32_S ((c_69s ^ c_64s ^ c_58s ^ c_56s), 1u); + const u32 c_73s = rotl32_S ((c_70s ^ c_65s ^ c_59s ^ c_57s), 1u); + const u32 c_74s = rotl32_S ((c_71s ^ c_66s ^ c_60s ^ c_58s), 1u); + const u32 c_75s = rotl32_S ((c_72s ^ c_67s ^ c_61s ^ c_59s), 1u); const u32 c_17sK = c_17s + SHA1C00; const u32 c_18sK = c_18s + SHA1C00; @@ -530,7 +524,7 @@ static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - const u32 e_rev = rotl32 (search[1], 2u); + const u32 e_rev = rotl32_S (search[1], 2u); /** * loop @@ -538,43 +532,43 @@ static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - const u32 w0s01 = rotl32 (w0, 1u); - const u32 w0s02 = rotl32 (w0, 2u); - const u32 w0s03 = rotl32 (w0, 3u); - const u32 w0s04 = rotl32 (w0, 4u); - const u32 w0s05 = rotl32 (w0, 5u); - const u32 w0s06 = rotl32 (w0, 6u); - const u32 w0s07 = rotl32 (w0, 7u); - const u32 w0s08 = rotl32 (w0, 8u); - const u32 w0s09 = rotl32 (w0, 9u); - const u32 w0s10 = rotl32 (w0, 10u); - const u32 w0s11 = rotl32 (w0, 11u); - const u32 w0s12 = rotl32 (w0, 12u); - const u32 w0s13 = rotl32 (w0, 13u); - const u32 w0s14 = rotl32 (w0, 14u); - const u32 w0s15 = rotl32 (w0, 15u); - const u32 w0s16 = rotl32 (w0, 16u); - const u32 w0s17 = rotl32 (w0, 17u); - const u32 w0s18 = rotl32 (w0, 18u); - const u32 w0s19 = rotl32 (w0, 19u); - const u32 w0s20 = rotl32 (w0, 20u); + const u32x w0s01 = rotl32 (w0, 1u); + const u32x w0s02 = rotl32 (w0, 2u); + const u32x w0s03 = rotl32 (w0, 3u); + const u32x w0s04 = rotl32 (w0, 4u); + const u32x w0s05 = rotl32 (w0, 5u); + const u32x w0s06 = rotl32 (w0, 6u); + const u32x w0s07 = rotl32 (w0, 7u); + const u32x w0s08 = rotl32 (w0, 8u); + const u32x w0s09 = rotl32 (w0, 9u); + const u32x w0s10 = rotl32 (w0, 10u); + const u32x w0s11 = rotl32 (w0, 11u); + const u32x w0s12 = rotl32 (w0, 12u); + const u32x w0s13 = rotl32 (w0, 13u); + const u32x w0s14 = rotl32 (w0, 14u); + const u32x w0s15 = rotl32 (w0, 15u); + const u32x w0s16 = rotl32 (w0, 16u); + const u32x w0s17 = rotl32 (w0, 17u); + const u32x w0s18 = rotl32 (w0, 18u); + const u32x w0s19 = rotl32 (w0, 19u); + const u32x w0s20 = rotl32 (w0, 20u); - const u32 w0s04___w0s06 = w0s04 ^ w0s06; - const u32 w0s04___w0s08 = w0s04 ^ w0s08; - const u32 w0s08___w0s12 = w0s08 ^ w0s12; - const u32 w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; + const u32x w0s04___w0s06 = w0s04 ^ w0s06; + const u32x w0s04___w0s08 = w0s04 ^ w0s08; + const u32x w0s08___w0s12 = w0s08 ^ w0s12; + const u32x w0s04___w0s06___w0s07 = w0s04___w0s06 ^ w0s07; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -669,13 +663,13 @@ static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SHA1_STEP (SHA1_F1 , b, c, d, e, a, (c_74s ^ w0s08 ^ w0s16)); SHA1_STEP (SHA1_F1 , a, b, c, d, e, (c_75s ^ w0s06 ^ w0s12 ^ w0s14)); - const u32 c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); - const u32 c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); - const u32 c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); - const u32 c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); + const u32x c_76s = rotl32 ((c_73s ^ c_68s ^ c_62s ^ c_60s), 1u); + const u32x c_77s = rotl32 ((c_74s ^ c_69s ^ c_63s ^ c_61s), 1u); + const u32x c_78s = rotl32 ((c_75s ^ c_70s ^ c_64s ^ c_62s), 1u); + const u32x c_79s = rotl32 ((c_76s ^ c_71s ^ c_65s ^ c_63s), 1u); - const u32 w0s21 = rotl32 (w0, 21u); - const u32 w0s22 = rotl32 (w0, 22U); + const u32x w0s21 = rotl32 (w0, 21u); + const u32x w0s22 = rotl32 (w0, 22U); SHA1_STEP (SHA1_F1 , e, a, b, c, d, (c_76s ^ w0s07 ^ w0s08___w0s12 ^ w0s16 ^ w0s21)); SHA1_STEP (SHA1_F1 , d, e, a, b, c, (c_77s)); @@ -688,22 +682,22 @@ static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k d += SHA1M_D; e += SHA1M_E; - u32 w0_t = a; - u32 w1_t = b; - u32 w2_t = c; - u32 w3_t = d; - u32 w4_t = e; - u32 w5_t = 0x80000000; - u32 w6_t = 0; - u32 w7_t = 0; - u32 w8_t = 0; - u32 w9_t = 0; - u32 wa_t = 0; - u32 wb_t = 0; - u32 wc_t = 0; - u32 wd_t = 0; - u32 we_t = 0; - u32 wf_t = 20 * 8; + u32x w0_t = a; + u32x w1_t = b; + u32x w2_t = c; + u32x w3_t = d; + u32x w4_t = e; + u32x w5_t = 0x80000000; + u32x w6_t = 0; + u32x w7_t = 0; + u32x w8_t = 0; + u32x w9_t = 0; + u32x wa_t = 0; + u32x wb_t = 0; + u32x wc_t = 0; + u32x wd_t = 0; + u32x we_t = 0; + u32x wf_t = 20 * 8; a = SHA1M_A; b = SHA1M_B; @@ -803,26 +797,18 @@ static void m00300s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); - bool q_cond = allx (e_rev != e); - - if (q_cond) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } -__kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -860,7 +846,7 @@ __kernel void m00300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00300m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00300_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00300_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -898,7 +884,7 @@ __kernel void m00300_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00300m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00300_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00300_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -936,7 +922,7 @@ __kernel void m00300_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00300m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -974,7 +960,7 @@ __kernel void m00300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00300s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00300_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00300_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -1012,7 +998,7 @@ __kernel void m00300_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00300s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00300_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00300_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m00900_a1.cl b/OpenCL/m00900_a1.cl index a9f6251d1..f302ab7da 100644 --- a/OpenCL/m00900_a1.cl +++ b/OpenCL/m00900_a1.cl @@ -70,7 +70,7 @@ __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -113,7 +113,7 @@ __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -267,7 +267,7 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -322,7 +322,7 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m00900_a3.cl b/OpenCL/m00900_a3.cl index b6a43b85b..e3025cc22 100644 --- a/OpenCL/m00900_a3.cl +++ b/OpenCL/m00900_a3.cl @@ -5,6 +5,8 @@ #define _MD4_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,13 +18,11 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define MD4_STEP_REV(f,a,b,c,d,x,t,s) \ { \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= f (b, c, d); \ a -= x; \ a -= t; \ @@ -30,12 +30,12 @@ #define MD4_STEP_REV1(f,a,b,c,d,x,t,s) \ { \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= x; \ a -= t; \ } -static void m00900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -105,16 +105,16 @@ static void m00900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MD4M_A; - u32 b = MD4M_B; - u32 c = MD4M_C; - u32 d = MD4M_D; + u32x a = MD4M_A; + u32x b = MD4M_B; + u32x c = MD4M_C; + u32x d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00); MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01); @@ -167,16 +167,11 @@ static void m00900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22); MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } -static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -261,28 +256,28 @@ static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 c_rev = digests_buf[digests_offset].digest_buf[2]; u32 d_rev = digests_buf[digests_offset].digest_buf[3]; - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20); - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20); - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20); - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20); const u32 sav_c = c_rev; const u32 sav_d = d_rev; - MD4_STEP_REV1(MD4_G, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13); - MD4_STEP_REV1(MD4_G, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12); + MD4_STEP_REV1(MD4_G_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13); + MD4_STEP_REV1(MD4_G_S, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12); /** * loop @@ -290,24 +285,24 @@ static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 pre_a = a_rev; - u32 pre_b = b_rev; - u32 pre_c = c_rev; + u32x pre_a = a_rev; + u32x pre_b = b_rev; + u32x pre_c = c_rev; pre_a = pre_a - w0; pre_b = pre_b - MD4_G (sav_c, sav_d, pre_a); pre_c = pre_c - MD4_G (sav_d, pre_a, pre_b); - u32 a = MD4M_A; - u32 b = MD4M_B; - u32 c = MD4M_C; - u32 d = MD4M_D; + u32x a = MD4M_A; + u32x b = MD4M_B; + u32x c = MD4M_C; + u32x d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00); MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01); @@ -338,16 +333,12 @@ static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD4_STEP0(MD4_Go, d, a, b, c, G_w6c01, MD4S11); MD4_STEP0(MD4_Go, c, d, a, b, G_wac01, MD4S12); - bool q_cond = allx (pre_c != c); - - if (q_cond) continue; + if (MATCHES_NONE_VV (c, pre_c)) continue; MD4_STEP0(MD4_Go, b, c, d, a, G_wec01, MD4S13); MD4_STEP0(MD4_Go, a, b, c, d, G_w3c01, MD4S10); - bool q_cond2 = allx (pre_a != a); - - if (q_cond2) continue; + if (MATCHES_NONE_VV (a, pre_a)) continue; MD4_STEP0(MD4_Go, d, a, b, c, G_w7c01, MD4S11); MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12); @@ -370,16 +361,11 @@ static void m00900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22); MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } -__kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -417,7 +403,7 @@ __kernel void m00900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -455,7 +441,7 @@ __kernel void m00900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -493,7 +479,7 @@ __kernel void m00900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -531,7 +517,7 @@ __kernel void m00900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -569,7 +555,7 @@ __kernel void m00900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m00900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m00900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m00900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m01000_a1.cl b/OpenCL/m01000_a1.cl index b38785a80..f9455fdfa 100644 --- a/OpenCL/m01000_a1.cl +++ b/OpenCL/m01000_a1.cl @@ -70,7 +70,7 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -113,7 +113,7 @@ __kernel void m01000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -279,7 +279,7 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -334,7 +334,7 @@ __kernel void m01000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01000_a3.cl b/OpenCL/m01000_a3.cl index 569313bf5..678b43f3b 100644 --- a/OpenCL/m01000_a3.cl +++ b/OpenCL/m01000_a3.cl @@ -22,7 +22,7 @@ #define MD4_STEP_REV(f,a,b,c,d,x,t,s) \ { \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= f (b, c, d); \ a -= x; \ a -= t; \ @@ -30,7 +30,7 @@ #define MD4_STEP_REV1(f,a,b,c,d,x,t,s) \ { \ - a = rotr32 (a, s); \ + a = rotr32_S (a, s); \ a -= x; \ a -= t; \ } @@ -111,8 +111,6 @@ static void m01000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k const u32x w0 = w0l | w0r; - u32x tmp2; - u32x a = MD4M_A; u32x b = MD4M_B; u32x c = MD4M_C; @@ -152,22 +150,22 @@ static void m01000m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12); MD4_STEP0(MD4_Go, b, c, d, a, G_wfc01, MD4S13); - MD4_STEP (MD4_H1, a, b, c, d, w0, H_w0c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_w8c02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w4c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wcc02, MD4S23); - MD4_STEP0(MD4_H1, a, b, c, d, H_w2c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_wac02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w6c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wec02, MD4S23); - MD4_STEP0(MD4_H1, a, b, c, d, H_w1c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_w9c02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w5c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wdc02, MD4S23); - MD4_STEP0(MD4_H1, a, b, c, d, H_w3c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_wbc02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w7c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wfc02, MD4S23); + MD4_STEP (MD4_H , a, b, c, d, w0, H_w0c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_w8c02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w4c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wcc02, MD4S23); + MD4_STEP0(MD4_H , a, b, c, d, H_w2c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_wac02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w6c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wec02, MD4S23); + MD4_STEP0(MD4_H , a, b, c, d, H_w1c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_w9c02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w5c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wdc02, MD4S23); + MD4_STEP0(MD4_H , a, b, c, d, H_w3c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_wbc02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23); COMPARE_M_SIMD (a, d, c, b); } @@ -253,33 +251,33 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k * reverse */ - u32x a_rev = digests_buf[digests_offset].digest_buf[0]; - u32x b_rev = digests_buf[digests_offset].digest_buf[1]; - u32x c_rev = digests_buf[digests_offset].digest_buf[2]; - u32x d_rev = digests_buf[digests_offset].digest_buf[3]; + u32 a_rev = digests_buf[digests_offset].digest_buf[0]; + u32 b_rev = digests_buf[digests_offset].digest_buf[1]; + u32 c_rev = digests_buf[digests_offset].digest_buf[2]; + u32 d_rev = digests_buf[digests_offset].digest_buf[3]; - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20); - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20); - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20); - MD4_STEP_REV (MD4_H, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23); - MD4_STEP_REV (MD4_H, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22); - MD4_STEP_REV (MD4_H, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21); - MD4_STEP_REV (MD4_H, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 7], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[11], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 3], MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[13], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 5], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 9], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 1], MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[14], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 6], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[10], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, w[ 2], MD4C02, MD4S20); + MD4_STEP_REV (MD4_H_S, b_rev, c_rev, d_rev, a_rev, w[12], MD4C02, MD4S23); + MD4_STEP_REV (MD4_H_S, c_rev, d_rev, a_rev, b_rev, w[ 4], MD4C02, MD4S22); + MD4_STEP_REV (MD4_H_S, d_rev, a_rev, b_rev, c_rev, w[ 8], MD4C02, MD4S21); + MD4_STEP_REV (MD4_H_S, a_rev, b_rev, c_rev, d_rev, 0, MD4C02, MD4S20); - const u32x sav_c = c_rev; - const u32x sav_d = d_rev; + const u32 sav_c = c_rev; + const u32 sav_d = d_rev; - MD4_STEP_REV1(MD4_G, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13); - MD4_STEP_REV1(MD4_G, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12); + MD4_STEP_REV1(MD4_G_S, b_rev, c_rev, d_rev, a_rev, w[15], MD4C01, MD4S13); + MD4_STEP_REV1(MD4_G_S, c_rev, d_rev, a_rev, b_rev, w[11], MD4C01, MD4S12); /** * loop @@ -301,8 +299,6 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k pre_b = pre_b - MD4_G (sav_c, sav_d, pre_a); pre_c = pre_c - MD4_G (sav_d, pre_a, pre_b); - u32x tmp2; - u32x a = MD4M_A; u32x b = MD4M_B; u32x c = MD4M_C; @@ -337,33 +333,33 @@ static void m01000s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD4_STEP0(MD4_Go, d, a, b, c, G_w6c01, MD4S11); MD4_STEP0(MD4_Go, c, d, a, b, G_wac01, MD4S12); - if (MATCHES_NONE_VV (pre_c, c)) continue; + if (MATCHES_NONE_VV (c, pre_c)) continue; MD4_STEP0(MD4_Go, b, c, d, a, G_wec01, MD4S13); MD4_STEP0(MD4_Go, a, b, c, d, G_w3c01, MD4S10); - if (MATCHES_NONE_VV (pre_a, a)) continue; + if (MATCHES_NONE_VV (a, pre_a)) continue; MD4_STEP0(MD4_Go, d, a, b, c, G_w7c01, MD4S11); MD4_STEP0(MD4_Go, c, d, a, b, G_wbc01, MD4S12); MD4_STEP0(MD4_Go, b, c, d, a, G_wfc01, MD4S13); - MD4_STEP (MD4_H1, a, b, c, d, w0, H_w0c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_w8c02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w4c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wcc02, MD4S23); - MD4_STEP0(MD4_H1, a, b, c, d, H_w2c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_wac02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w6c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wec02, MD4S23); - MD4_STEP0(MD4_H1, a, b, c, d, H_w1c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_w9c02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w5c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wdc02, MD4S23); - MD4_STEP0(MD4_H1, a, b, c, d, H_w3c02, MD4S20); - MD4_STEP0(MD4_H2, d, a, b, c, H_wbc02, MD4S21); - MD4_STEP0(MD4_H1, c, d, a, b, H_w7c02, MD4S22); - MD4_STEP0(MD4_H2, b, c, d, a, H_wfc02, MD4S23); + MD4_STEP (MD4_H , a, b, c, d, w0, H_w0c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_w8c02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w4c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wcc02, MD4S23); + MD4_STEP0(MD4_H , a, b, c, d, H_w2c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_wac02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w6c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wec02, MD4S23); + MD4_STEP0(MD4_H , a, b, c, d, H_w1c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_w9c02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w5c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wdc02, MD4S23); + MD4_STEP0(MD4_H , a, b, c, d, H_w3c02, MD4S20); + MD4_STEP0(MD4_H , d, a, b, c, H_wbc02, MD4S21); + MD4_STEP0(MD4_H , c, d, a, b, H_w7c02, MD4S22); + MD4_STEP0(MD4_H , b, c, d, a, H_wfc02, MD4S23); COMPARE_S_SIMD (a, d, c, b); } diff --git a/OpenCL/m01100_a1.cl b/OpenCL/m01100_a1.cl index d15711bce..0a77ff3a3 100644 --- a/OpenCL/m01100_a1.cl +++ b/OpenCL/m01100_a1.cl @@ -70,7 +70,7 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -140,7 +140,7 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -382,7 +382,7 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -464,7 +464,7 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01100_a3.cl b/OpenCL/m01100_a3.cl index 4e7fcc7ea..6d2695590 100644 --- a/OpenCL/m01100_a3.cl +++ b/OpenCL/m01100_a3.cl @@ -5,6 +5,8 @@ #define _MD4_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -117,16 +117,16 @@ static void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MD4M_A; - u32 b = MD4M_B; - u32 c = MD4M_C; - u32 d = MD4M_D; + u32x a = MD4M_A; + u32x b = MD4M_B; + u32x c = MD4M_C; + u32x d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00); MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01); @@ -184,10 +184,10 @@ static void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c += MD4M_C; d += MD4M_D; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = a; w0_t[1] = b; @@ -262,16 +262,11 @@ static void m01100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } -static void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -380,16 +375,16 @@ static void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MD4M_A; - u32 b = MD4M_B; - u32 c = MD4M_C; - u32 d = MD4M_D; + u32x a = MD4M_A; + u32x b = MD4M_B; + u32x c = MD4M_C; + u32x d = MD4M_D; MD4_STEP (MD4_Fo, a, b, c, d, w0, F_w0c00, MD4S00); MD4_STEP0(MD4_Fo, d, a, b, c, F_w1c00, MD4S01); @@ -447,10 +442,10 @@ static void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c += MD4M_C; d += MD4M_D; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = a; w0_t[1] = b; @@ -522,24 +517,17 @@ static void m01100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD4_STEP (MD4_H , b, c, d, a, w3_t[1], MD4C02, MD4S23); MD4_STEP (MD4_H , a, b, c, d, w0_t[3], MD4C02, MD4S20); - bool q_cond = allx (search[0] != a); - - if (q_cond) continue; + if (MATCHES_NONE_VS (a, search[0])) continue; MD4_STEP (MD4_H , d, a, b, c, w2_t[3], MD4C02, MD4S21); MD4_STEP (MD4_H , c, d, a, b, w1_t[3], MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, w3_t[3], MD4C02, MD4S23); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } -__kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -577,7 +565,7 @@ __kernel void m01100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -615,11 +603,11 @@ __kernel void m01100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -657,7 +645,7 @@ __kernel void m01100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -695,6 +683,6 @@ __kernel void m01100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } diff --git a/OpenCL/m01400_a1.cl b/OpenCL/m01400_a1.cl index 1224fb691..1354ab4b0 100644 --- a/OpenCL/m01400_a1.cl +++ b/OpenCL/m01400_a1.cl @@ -70,7 +70,7 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -115,7 +115,7 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -306,7 +306,7 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -363,7 +363,7 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01400_a3.cl b/OpenCL/m01400_a3.cl index 22d578a0b..6f7228ce2 100644 --- a/OpenCL/m01400_a3.cl +++ b/OpenCL/m01400_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m01400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -35,37 +35,37 @@ static void m01400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t = w0; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; + u32x w0_t = w0; + u32x w1_t = w[ 1]; + u32x w2_t = w[ 2]; + u32x w3_t = w[ 3]; + u32x w4_t = w[ 4]; + u32x w5_t = w[ 5]; + u32x w6_t = w[ 6]; + u32x w7_t = w[ 7]; + u32x w8_t = w[ 8]; + u32x w9_t = w[ 9]; + u32x wa_t = w[10]; + u32x wb_t = w[11]; + u32x wc_t = w[12]; + u32x wd_t = w[13]; + u32x we_t = w[14]; + u32x wf_t = w[15]; - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -135,17 +135,11 @@ static void m01400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_M + COMPARE_M_SIMD (d, h, c, g); } } -static void m01400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -172,37 +166,37 @@ static void m01400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t = w0; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; + u32x w0_t = w0; + u32x w1_t = w[ 1]; + u32x w2_t = w[ 2]; + u32x w3_t = w[ 3]; + u32x w4_t = w[ 4]; + u32x w5_t = w[ 5]; + u32x w6_t = w[ 6]; + u32x w7_t = w[ 7]; + u32x w8_t = w[ 8]; + u32x w9_t = w[ 9]; + u32x wa_t = w[10]; + u32x wb_t = w[11]; + u32x wc_t = w[12]; + u32x wd_t = w[13]; + u32x we_t = w[14]; + u32x wf_t = w[15]; - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -268,20 +262,18 @@ static void m01400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + + if (MATCHES_NONE_VS (d, search[0])) continue; + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_S + COMPARE_S_SIMD (d, h, c, g); } } -__kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -319,7 +311,7 @@ __kernel void m01400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -357,7 +349,7 @@ __kernel void m01400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -395,7 +387,7 @@ __kernel void m01400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -433,7 +425,7 @@ __kernel void m01400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01400s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -471,7 +463,7 @@ __kernel void m01400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01400s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m01410_a0.cl b/OpenCL/m01410_a0.cl index e34acb39d..8e7d2eb5b 100644 --- a/OpenCL/m01410_a0.cl +++ b/OpenCL/m01410_a0.cl @@ -142,7 +142,7 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 out_salt_len = out_len + salt_len; @@ -416,7 +416,7 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 out_salt_len = out_len + salt_len; diff --git a/OpenCL/m01410_a1.cl b/OpenCL/m01410_a1.cl index 2a8e52b73..7ed522213 100644 --- a/OpenCL/m01410_a1.cl +++ b/OpenCL/m01410_a1.cl @@ -68,7 +68,7 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -131,7 +131,7 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -166,7 +166,7 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -358,7 +358,7 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -433,7 +433,7 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -468,7 +468,7 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m01410_a3.cl b/OpenCL/m01410_a3.cl index 890713ffe..49f8ed819 100644 --- a/OpenCL/m01410_a3.cl +++ b/OpenCL/m01410_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -61,24 +61,24 @@ static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap32 (salt_buf0[0]); - w[ 1] |= swap32 (salt_buf0[1]); - w[ 2] |= swap32 (salt_buf0[2]); - w[ 3] |= swap32 (salt_buf0[3]); - w[ 4] |= swap32 (salt_buf1[0]); - w[ 5] |= swap32 (salt_buf1[1]); - w[ 6] |= swap32 (salt_buf1[2]); - w[ 7] |= swap32 (salt_buf1[3]); - w[ 8] |= swap32 (salt_buf2[0]); - w[ 9] |= swap32 (salt_buf2[1]); - w[10] |= swap32 (salt_buf2[2]); - w[11] |= swap32 (salt_buf2[3]); - w[12] |= swap32 (salt_buf3[0]); - w[13] |= swap32 (salt_buf3[1]); - w[14] |= swap32 (salt_buf3[2]); - w[15] |= swap32 (salt_buf3[3]); + w[ 0] |= swap32_S (salt_buf0[0]); + w[ 1] |= swap32_S (salt_buf0[1]); + w[ 2] |= swap32_S (salt_buf0[2]); + w[ 3] |= swap32_S (salt_buf0[3]); + w[ 4] |= swap32_S (salt_buf1[0]); + w[ 5] |= swap32_S (salt_buf1[1]); + w[ 6] |= swap32_S (salt_buf1[2]); + w[ 7] |= swap32_S (salt_buf1[3]); + w[ 8] |= swap32_S (salt_buf2[0]); + w[ 9] |= swap32_S (salt_buf2[1]); + w[10] |= swap32_S (salt_buf2[2]); + w[11] |= swap32_S (salt_buf2[3]); + w[12] |= swap32_S (salt_buf3[0]); + w[13] |= swap32_S (salt_buf3[1]); + w[14] |= swap32_S (salt_buf3[2]); + w[15] |= swap32_S (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -92,37 +92,37 @@ static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t = w0; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; + u32x w0_t = w0; + u32x w1_t = w[ 1]; + u32x w2_t = w[ 2]; + u32x w3_t = w[ 3]; + u32x w4_t = w[ 4]; + u32x w5_t = w[ 5]; + u32x w6_t = w[ 6]; + u32x w7_t = w[ 7]; + u32x w8_t = w[ 8]; + u32x w9_t = w[ 9]; + u32x wa_t = w[10]; + u32x wb_t = w[11]; + u32x wc_t = w[12]; + u32x wd_t = w[13]; + u32x we_t = w[14]; + u32x wf_t = w[15]; - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -192,17 +192,11 @@ static void m01410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_M + COMPARE_M_SIMD (d, h, c, g); } } -static void m01410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -229,37 +223,37 @@ static void m01410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t = w0; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; + u32x w0_t = w0; + u32x w1_t = w[ 1]; + u32x w2_t = w[ 2]; + u32x w3_t = w[ 3]; + u32x w4_t = w[ 4]; + u32x w5_t = w[ 5]; + u32x w6_t = w[ 6]; + u32x w7_t = w[ 7]; + u32x w8_t = w[ 8]; + u32x w9_t = w[ 9]; + u32x wa_t = w[10]; + u32x wb_t = w[11]; + u32x wc_t = w[12]; + u32x wd_t = w[13]; + u32x we_t = w[14]; + u32x wf_t = w[15]; - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -325,21 +319,18 @@ static void m01410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + + if (MATCHES_NONE_VS (d, search[0])) continue; + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_S + COMPARE_S_SIMD (d, h, c, g); } } -__kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -377,7 +368,7 @@ __kernel void m01410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -415,7 +406,7 @@ __kernel void m01410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -453,7 +444,7 @@ __kernel void m01410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -491,7 +482,7 @@ __kernel void m01410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01410s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -529,7 +520,7 @@ __kernel void m01410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01410s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m01420_a0.cl b/OpenCL/m01420_a0.cl index c365a99d1..726b99268 100644 --- a/OpenCL/m01420_a0.cl +++ b/OpenCL/m01420_a0.cl @@ -116,7 +116,7 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 out_salt_len = out_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; @@ -351,7 +351,7 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 out_salt_len = out_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; diff --git a/OpenCL/m01420_a1.cl b/OpenCL/m01420_a1.cl index 92e668226..bf55faed9 100644 --- a/OpenCL/m01420_a1.cl +++ b/OpenCL/m01420_a1.cl @@ -68,7 +68,7 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -125,7 +125,7 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -156,7 +156,7 @@ __kernel void m01420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = pw_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; @@ -333,7 +333,7 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -402,7 +402,7 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -433,7 +433,7 @@ __kernel void m01420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = pw_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; diff --git a/OpenCL/m01420_a3.cl b/OpenCL/m01420_a3.cl index 21373593a..9250b0216 100644 --- a/OpenCL/m01420_a3.cl +++ b/OpenCL/m01420_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -65,92 +65,126 @@ static void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t2[4]; - u32 w1_t2[4]; - u32 w2_t2[4]; - u32 w3_t2[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t2[0] = swap32 (w0[0]); - w0_t2[1] = swap32 (w0[1]); - w0_t2[2] = swap32 (w0[2]); - w0_t2[3] = swap32 (w0[3]); - w1_t2[0] = swap32 (w1[0]); - w1_t2[1] = swap32 (w1[1]); - w1_t2[2] = swap32 (w1[2]); - w1_t2[3] = swap32 (w1[3]); - w2_t2[0] = swap32 (w2[0]); - w2_t2[1] = swap32 (w2[1]); - w2_t2[2] = swap32 (w2[2]); - w2_t2[3] = swap32 (w2[3]); - w3_t2[0] = swap32 (w3[0]); - w3_t2[1] = swap32 (w3[1]); - w3_t2[2] = swap32 (w3[2]); - w3_t2[3] = swap32 (w3[3]); + overwrite_at_be (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + u32x w0_t = wx[ 0]; + u32x w1_t = wx[ 1]; + u32x w2_t = wx[ 2]; + u32x w3_t = wx[ 3]; + u32x w4_t = wx[ 4]; + u32x w5_t = wx[ 5]; + u32x w6_t = wx[ 6]; + u32x w7_t = wx[ 7]; + u32x w8_t = wx[ 8]; + u32x w9_t = wx[ 9]; + u32x wa_t = wx[10]; + u32x wb_t = wx[11]; + u32x wc_t = wx[12]; + u32x wd_t = wx[13]; + u32x we_t = 0; + u32x wf_t = pw_salt_len * 8; - w0_t2[0] |= salt_buf0[0]; - w0_t2[1] |= salt_buf0[1]; - w0_t2[2] |= salt_buf0[2]; - w0_t2[3] |= salt_buf0[3]; - w1_t2[0] |= salt_buf1[0]; - w1_t2[1] |= salt_buf1[1]; - w1_t2[2] |= salt_buf1[2]; - w1_t2[3] |= salt_buf1[3]; - w2_t2[0] |= salt_buf2[0]; - w2_t2[1] |= salt_buf2[1]; - w2_t2[2] |= salt_buf2[2]; - w2_t2[3] |= salt_buf2[3]; - w3_t2[0] |= salt_buf3[0]; - w3_t2[1] |= salt_buf3[1]; - w3_t2[2] |= salt_buf3[2]; - w3_t2[3] |= salt_buf3[3]; - - /** - * sha256 - */ - - u32 w0_t = swap32 (w0_t2[0]); - u32 w1_t = swap32 (w0_t2[1]); - u32 w2_t = swap32 (w0_t2[2]); - u32 w3_t = swap32 (w0_t2[3]); - u32 w4_t = swap32 (w1_t2[0]); - u32 w5_t = swap32 (w1_t2[1]); - u32 w6_t = swap32 (w1_t2[2]); - u32 w7_t = swap32 (w1_t2[3]); - u32 w8_t = swap32 (w2_t2[0]); - u32 w9_t = swap32 (w2_t2[1]); - u32 wa_t = swap32 (w2_t2[2]); - u32 wb_t = swap32 (w2_t2[3]); - u32 wc_t = swap32 (w3_t2[0]); - u32 wd_t = swap32 (w3_t2[1]); - u32 we_t = 0; - u32 wf_t = pw_salt_len * 8; - - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -220,13 +254,7 @@ static void m01420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_M + COMPARE_M_SIMD (d, h, c, g); } } @@ -287,92 +315,126 @@ static void m01420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t2[4]; - u32 w1_t2[4]; - u32 w2_t2[4]; - u32 w3_t2[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t2[0] = swap32 (w0[0]); - w0_t2[1] = swap32 (w0[1]); - w0_t2[2] = swap32 (w0[2]); - w0_t2[3] = swap32 (w0[3]); - w1_t2[0] = swap32 (w1[0]); - w1_t2[1] = swap32 (w1[1]); - w1_t2[2] = swap32 (w1[2]); - w1_t2[3] = swap32 (w1[3]); - w2_t2[0] = swap32 (w2[0]); - w2_t2[1] = swap32 (w2[1]); - w2_t2[2] = swap32 (w2[2]); - w2_t2[3] = swap32 (w2[3]); - w3_t2[0] = swap32 (w3[0]); - w3_t2[1] = swap32 (w3[1]); - w3_t2[2] = swap32 (w3[2]); - w3_t2[3] = swap32 (w3[3]); + overwrite_at_be (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + u32x w0_t = wx[ 0]; + u32x w1_t = wx[ 1]; + u32x w2_t = wx[ 2]; + u32x w3_t = wx[ 3]; + u32x w4_t = wx[ 4]; + u32x w5_t = wx[ 5]; + u32x w6_t = wx[ 6]; + u32x w7_t = wx[ 7]; + u32x w8_t = wx[ 8]; + u32x w9_t = wx[ 9]; + u32x wa_t = wx[10]; + u32x wb_t = wx[11]; + u32x wc_t = wx[12]; + u32x wd_t = wx[13]; + u32x we_t = 0; + u32x wf_t = pw_salt_len * 8; - w0_t2[0] |= salt_buf0[0]; - w0_t2[1] |= salt_buf0[1]; - w0_t2[2] |= salt_buf0[2]; - w0_t2[3] |= salt_buf0[3]; - w1_t2[0] |= salt_buf1[0]; - w1_t2[1] |= salt_buf1[1]; - w1_t2[2] |= salt_buf1[2]; - w1_t2[3] |= salt_buf1[3]; - w2_t2[0] |= salt_buf2[0]; - w2_t2[1] |= salt_buf2[1]; - w2_t2[2] |= salt_buf2[2]; - w2_t2[3] |= salt_buf2[3]; - w3_t2[0] |= salt_buf3[0]; - w3_t2[1] |= salt_buf3[1]; - w3_t2[2] |= salt_buf3[2]; - w3_t2[3] |= salt_buf3[3]; - - /** - * sha256 - */ - - u32 w0_t = swap32 (w0_t2[0]); - u32 w1_t = swap32 (w0_t2[1]); - u32 w2_t = swap32 (w0_t2[2]); - u32 w3_t = swap32 (w0_t2[3]); - u32 w4_t = swap32 (w1_t2[0]); - u32 w5_t = swap32 (w1_t2[1]); - u32 w6_t = swap32 (w1_t2[2]); - u32 w7_t = swap32 (w1_t2[3]); - u32 w8_t = swap32 (w2_t2[0]); - u32 w9_t = swap32 (w2_t2[1]); - u32 wa_t = swap32 (w2_t2[2]); - u32 wb_t = swap32 (w2_t2[3]); - u32 wc_t = swap32 (w3_t2[0]); - u32 wd_t = swap32 (w3_t2[1]); - u32 we_t = 0; - u32 wf_t = pw_salt_len * 8; - - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -438,17 +500,14 @@ static void m01420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + + if (MATCHES_NONE_VS (d, search[0])) continue; + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_S + COMPARE_S_SIMD (d, h, c, g); } } diff --git a/OpenCL/m01430_a0.cl b/OpenCL/m01430_a0.cl index bf8bc4a14..894c8e482 100644 --- a/OpenCL/m01430_a0.cl +++ b/OpenCL/m01430_a0.cl @@ -142,7 +142,7 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; @@ -421,7 +421,7 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; diff --git a/OpenCL/m01430_a1.cl b/OpenCL/m01430_a1.cl index aa038a0ed..01538ec44 100644 --- a/OpenCL/m01430_a1.cl +++ b/OpenCL/m01430_a1.cl @@ -68,7 +68,7 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -125,7 +125,7 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -182,7 +182,7 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; @@ -377,7 +377,7 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -446,7 +446,7 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -503,7 +503,7 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; diff --git a/OpenCL/m01430_a3.cl b/OpenCL/m01430_a3.cl index 807a2b927..47b61332c 100644 --- a/OpenCL/m01430_a3.cl +++ b/OpenCL/m01430_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -61,24 +61,24 @@ static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap32 (salt_buf0[0]); - w[ 1] |= swap32 (salt_buf0[1]); - w[ 2] |= swap32 (salt_buf0[2]); - w[ 3] |= swap32 (salt_buf0[3]); - w[ 4] |= swap32 (salt_buf1[0]); - w[ 5] |= swap32 (salt_buf1[1]); - w[ 6] |= swap32 (salt_buf1[2]); - w[ 7] |= swap32 (salt_buf1[3]); - w[ 8] |= swap32 (salt_buf2[0]); - w[ 9] |= swap32 (salt_buf2[1]); - w[10] |= swap32 (salt_buf2[2]); - w[11] |= swap32 (salt_buf2[3]); - w[12] |= swap32 (salt_buf3[0]); - w[13] |= swap32 (salt_buf3[1]); - w[14] |= swap32 (salt_buf3[2]); - w[15] |= swap32 (salt_buf3[3]); + w[ 0] |= swap32_S (salt_buf0[0]); + w[ 1] |= swap32_S (salt_buf0[1]); + w[ 2] |= swap32_S (salt_buf0[2]); + w[ 3] |= swap32_S (salt_buf0[3]); + w[ 4] |= swap32_S (salt_buf1[0]); + w[ 5] |= swap32_S (salt_buf1[1]); + w[ 6] |= swap32_S (salt_buf1[2]); + w[ 7] |= swap32_S (salt_buf1[3]); + w[ 8] |= swap32_S (salt_buf2[0]); + w[ 9] |= swap32_S (salt_buf2[1]); + w[10] |= swap32_S (salt_buf2[2]); + w[11] |= swap32_S (salt_buf2[3]); + w[12] |= swap32_S (salt_buf3[0]); + w[13] |= swap32_S (salt_buf3[1]); + w[14] |= swap32_S (salt_buf3[2]); + w[15] |= swap32_S (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -92,37 +92,37 @@ static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t = w0; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; + u32x w0_t = w0; + u32x w1_t = w[ 1]; + u32x w2_t = w[ 2]; + u32x w3_t = w[ 3]; + u32x w4_t = w[ 4]; + u32x w5_t = w[ 5]; + u32x w6_t = w[ 6]; + u32x w7_t = w[ 7]; + u32x w8_t = w[ 8]; + u32x w9_t = w[ 9]; + u32x wa_t = w[10]; + u32x wb_t = w[11]; + u32x wc_t = w[12]; + u32x wd_t = w[13]; + u32x we_t = w[14]; + u32x wf_t = w[15]; - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -192,17 +192,11 @@ static void m01430m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_M + COMPARE_M_SIMD (d, h, c, g); } } -static void m01430s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01430s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -229,37 +223,37 @@ static void m01430s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t = w0; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; + u32x w0_t = w0; + u32x w1_t = w[ 1]; + u32x w2_t = w[ 2]; + u32x w3_t = w[ 3]; + u32x w4_t = w[ 4]; + u32x w5_t = w[ 5]; + u32x w6_t = w[ 6]; + u32x w7_t = w[ 7]; + u32x w8_t = w[ 8]; + u32x w9_t = w[ 9]; + u32x wa_t = w[10]; + u32x wb_t = w[11]; + u32x wc_t = w[12]; + u32x wd_t = w[13]; + u32x we_t = w[14]; + u32x wf_t = w[15]; - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -325,21 +319,18 @@ static void m01430s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + + if (MATCHES_NONE_VS (d, search[0])) continue; + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_S + COMPARE_S_SIMD (d, h, c, g); } } -__kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -377,7 +368,7 @@ __kernel void m01430_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01430m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01430_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01430_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -415,7 +406,7 @@ __kernel void m01430_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01430m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01430_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01430_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -453,7 +444,7 @@ __kernel void m01430_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01430m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -491,7 +482,7 @@ __kernel void m01430_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01430s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01430_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01430_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -529,7 +520,7 @@ __kernel void m01430_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01430s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01430_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01430_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m01440_a0.cl b/OpenCL/m01440_a0.cl index 80b644eb6..c490b3c06 100644 --- a/OpenCL/m01440_a0.cl +++ b/OpenCL/m01440_a0.cl @@ -118,7 +118,7 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); w0_t2[0] |= salt_buf0[0]; w0_t2[1] |= salt_buf0[1]; @@ -355,7 +355,7 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); w0_t2[0] |= salt_buf0[0]; w0_t2[1] |= salt_buf0[1]; diff --git a/OpenCL/m01440_a1.cl b/OpenCL/m01440_a1.cl index 3c2913130..be82ddfe9 100644 --- a/OpenCL/m01440_a1.cl +++ b/OpenCL/m01440_a1.cl @@ -68,7 +68,7 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -125,7 +125,7 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -164,7 +164,7 @@ __kernel void m01440_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); w0_t2[0] |= salt_buf0[0]; w0_t2[1] |= salt_buf0[1]; @@ -341,7 +341,7 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -410,7 +410,7 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -449,7 +449,7 @@ __kernel void m01440_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t2, w1_t2); make_unicode (w1, w2_t2, w3_t2); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + switch_buffer_by_offset_le (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); w0_t2[0] |= salt_buf0[0]; w0_t2[1] |= salt_buf0[1]; diff --git a/OpenCL/m01440_a3.cl b/OpenCL/m01440_a3.cl index 37ceeb012..b8289d0a9 100644 --- a/OpenCL/m01440_a3.cl +++ b/OpenCL/m01440_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -65,92 +65,126 @@ static void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t2[4]; - u32 w1_t2[4]; - u32 w2_t2[4]; - u32 w3_t2[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t2[0] = swap32 (w0[0]); - w0_t2[1] = swap32 (w0[1]); - w0_t2[2] = swap32 (w0[2]); - w0_t2[3] = swap32 (w0[3]); - w1_t2[0] = swap32 (w1[0]); - w1_t2[1] = swap32 (w1[1]); - w1_t2[2] = swap32 (w1[2]); - w1_t2[3] = swap32 (w1[3]); - w2_t2[0] = swap32 (w2[0]); - w2_t2[1] = swap32 (w2[1]); - w2_t2[2] = swap32 (w2[2]); - w2_t2[3] = swap32 (w2[3]); - w3_t2[0] = swap32 (w3[0]); - w3_t2[1] = swap32 (w3[1]); - w3_t2[2] = swap32 (w3[2]); - w3_t2[3] = swap32 (w3[3]); + overwrite_at_be (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + u32x w0_t = wx[ 0]; + u32x w1_t = wx[ 1]; + u32x w2_t = wx[ 2]; + u32x w3_t = wx[ 3]; + u32x w4_t = wx[ 4]; + u32x w5_t = wx[ 5]; + u32x w6_t = wx[ 6]; + u32x w7_t = wx[ 7]; + u32x w8_t = wx[ 8]; + u32x w9_t = wx[ 9]; + u32x wa_t = wx[10]; + u32x wb_t = wx[11]; + u32x wc_t = wx[12]; + u32x wd_t = wx[13]; + u32x we_t = 0; + u32x wf_t = pw_salt_len * 8; - w0_t2[0] |= salt_buf0[0]; - w0_t2[1] |= salt_buf0[1]; - w0_t2[2] |= salt_buf0[2]; - w0_t2[3] |= salt_buf0[3]; - w1_t2[0] |= salt_buf1[0]; - w1_t2[1] |= salt_buf1[1]; - w1_t2[2] |= salt_buf1[2]; - w1_t2[3] |= salt_buf1[3]; - w2_t2[0] |= salt_buf2[0]; - w2_t2[1] |= salt_buf2[1]; - w2_t2[2] |= salt_buf2[2]; - w2_t2[3] |= salt_buf2[3]; - w3_t2[0] |= salt_buf3[0]; - w3_t2[1] |= salt_buf3[1]; - w3_t2[2] |= salt_buf3[2]; - w3_t2[3] |= salt_buf3[3]; - - /** - * sha256 - */ - - u32 w0_t = swap32 (w0_t2[0]); - u32 w1_t = swap32 (w0_t2[1]); - u32 w2_t = swap32 (w0_t2[2]); - u32 w3_t = swap32 (w0_t2[3]); - u32 w4_t = swap32 (w1_t2[0]); - u32 w5_t = swap32 (w1_t2[1]); - u32 w6_t = swap32 (w1_t2[2]); - u32 w7_t = swap32 (w1_t2[3]); - u32 w8_t = swap32 (w2_t2[0]); - u32 w9_t = swap32 (w2_t2[1]); - u32 wa_t = swap32 (w2_t2[2]); - u32 wb_t = swap32 (w2_t2[3]); - u32 wc_t = swap32 (w3_t2[0]); - u32 wd_t = swap32 (w3_t2[1]); - u32 we_t = 0; - u32 wf_t = pw_salt_len * 8; - - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -220,13 +254,7 @@ static void m01440m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_M + COMPARE_M_SIMD (d, h, c, g); } } @@ -287,92 +315,126 @@ static void m01440s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t2[4]; - u32 w1_t2[4]; - u32 w2_t2[4]; - u32 w3_t2[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t2[0] = swap32 (w0[0]); - w0_t2[1] = swap32 (w0[1]); - w0_t2[2] = swap32 (w0[2]); - w0_t2[3] = swap32 (w0[3]); - w1_t2[0] = swap32 (w1[0]); - w1_t2[1] = swap32 (w1[1]); - w1_t2[2] = swap32 (w1[2]); - w1_t2[3] = swap32 (w1[3]); - w2_t2[0] = swap32 (w2[0]); - w2_t2[1] = swap32 (w2[1]); - w2_t2[2] = swap32 (w2[2]); - w2_t2[3] = swap32 (w2[3]); - w3_t2[0] = swap32 (w3[0]); - w3_t2[1] = swap32 (w3[1]); - w3_t2[2] = swap32 (w3[2]); - w3_t2[3] = swap32 (w3[3]); + overwrite_at_be (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t2, w1_t2, w2_t2, w3_t2, salt_len); + u32x w0_t = wx[ 0]; + u32x w1_t = wx[ 1]; + u32x w2_t = wx[ 2]; + u32x w3_t = wx[ 3]; + u32x w4_t = wx[ 4]; + u32x w5_t = wx[ 5]; + u32x w6_t = wx[ 6]; + u32x w7_t = wx[ 7]; + u32x w8_t = wx[ 8]; + u32x w9_t = wx[ 9]; + u32x wa_t = wx[10]; + u32x wb_t = wx[11]; + u32x wc_t = wx[12]; + u32x wd_t = wx[13]; + u32x we_t = 0; + u32x wf_t = pw_salt_len * 8; - w0_t2[0] |= salt_buf0[0]; - w0_t2[1] |= salt_buf0[1]; - w0_t2[2] |= salt_buf0[2]; - w0_t2[3] |= salt_buf0[3]; - w1_t2[0] |= salt_buf1[0]; - w1_t2[1] |= salt_buf1[1]; - w1_t2[2] |= salt_buf1[2]; - w1_t2[3] |= salt_buf1[3]; - w2_t2[0] |= salt_buf2[0]; - w2_t2[1] |= salt_buf2[1]; - w2_t2[2] |= salt_buf2[2]; - w2_t2[3] |= salt_buf2[3]; - w3_t2[0] |= salt_buf3[0]; - w3_t2[1] |= salt_buf3[1]; - w3_t2[2] |= salt_buf3[2]; - w3_t2[3] |= salt_buf3[3]; - - /** - * sha256 - */ - - u32 w0_t = swap32 (w0_t2[0]); - u32 w1_t = swap32 (w0_t2[1]); - u32 w2_t = swap32 (w0_t2[2]); - u32 w3_t = swap32 (w0_t2[3]); - u32 w4_t = swap32 (w1_t2[0]); - u32 w5_t = swap32 (w1_t2[1]); - u32 w6_t = swap32 (w1_t2[2]); - u32 w7_t = swap32 (w1_t2[3]); - u32 w8_t = swap32 (w2_t2[0]); - u32 w9_t = swap32 (w2_t2[1]); - u32 wa_t = swap32 (w2_t2[2]); - u32 wb_t = swap32 (w2_t2[3]); - u32 wc_t = swap32 (w3_t2[0]); - u32 wd_t = swap32 (w3_t2[1]); - u32 we_t = 0; - u32 wf_t = pw_salt_len * 8; - - u32 a = SHA256M_A; - u32 b = SHA256M_B; - u32 c = SHA256M_C; - u32 d = SHA256M_D; - u32 e = SHA256M_E; - u32 f = SHA256M_F; - u32 g = SHA256M_G; - u32 h = SHA256M_H; + u32x a = SHA256M_A; + u32x b = SHA256M_B; + u32x c = SHA256M_C; + u32x d = SHA256M_D; + u32x e = SHA256M_E; + u32x f = SHA256M_F; + u32x g = SHA256M_G; + u32x h = SHA256M_H; SHA256_STEP (SHA256_F0o, SHA256_F1o, a, b, c, d, e, f, g, h, w0_t, SHA256C00); SHA256_STEP (SHA256_F0o, SHA256_F1o, h, a, b, c, d, e, f, g, w1_t, SHA256C01); @@ -438,17 +500,14 @@ static void m01440s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wa_t = SHA256_EXPAND (w8_t, w3_t, wb_t, wa_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, g, h, a, b, c, d, e, f, wa_t, SHA256C3a); wb_t = SHA256_EXPAND (w9_t, w4_t, wc_t, wb_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, f, g, h, a, b, c, d, e, wb_t, SHA256C3b); wc_t = SHA256_EXPAND (wa_t, w5_t, wd_t, wc_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, e, f, g, h, a, b, c, d, wc_t, SHA256C3c); + + if (MATCHES_NONE_VS (d, search[0])) continue; + wd_t = SHA256_EXPAND (wb_t, w6_t, we_t, wd_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, d, e, f, g, h, a, b, c, wd_t, SHA256C3d); we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_S + COMPARE_S_SIMD (d, h, c, g); } } diff --git a/OpenCL/m01450_a1.cl b/OpenCL/m01450_a1.cl index 73cffdc97..e81668094 100644 --- a/OpenCL/m01450_a1.cl +++ b/OpenCL/m01450_a1.cl @@ -275,7 +275,7 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -338,7 +338,7 @@ __kernel void m01450_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -492,7 +492,7 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -567,7 +567,7 @@ __kernel void m01450_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01450_a3.cl b/OpenCL/m01450_a3.cl index f5d143678..683f1dd95 100644 --- a/OpenCL/m01450_a3.cl +++ b/OpenCL/m01450_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 k_sha256[64] = { @@ -40,33 +40,33 @@ __constant u32 k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8]) +static void sha256_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[8]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; + u32x e = digest[4]; + u32x f = digest[5]; + u32x g = digest[6]; + u32x h = digest[7]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #define ROUND_EXPAND() \ { \ @@ -126,7 +126,7 @@ static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], digest[7] += h; } -static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8]) +static void hmac_sha256_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -185,7 +185,7 @@ static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipa sha256_transform (w0, w1, w2, w3, opad); } -static void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8]) +static void hmac_sha256_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8], u32x digest[8]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -262,46 +262,46 @@ static void m01450m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[8]; - u32 opad[8]; + u32x ipad[8]; + u32x opad[8]; hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -322,16 +322,11 @@ static void m01450m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + salt_len) * 8; - u32 digest[8]; + u32x digest[8]; hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[7]; - const u32 r2 = digest[2]; - const u32 r3 = digest[6]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]); } } @@ -382,46 +377,46 @@ static void m01450s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[8]; - u32 opad[8]; + u32x ipad[8]; + u32x opad[8]; hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -442,16 +437,11 @@ static void m01450s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + salt_len) * 8; - u32 digest[8]; + u32x digest[8]; hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[7]; - const u32 r2 = digest[2]; - const u32 r3 = digest[6]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]); } } diff --git a/OpenCL/m01460_a1.cl b/OpenCL/m01460_a1.cl index 8d1697a98..2e83a07af 100644 --- a/OpenCL/m01460_a1.cl +++ b/OpenCL/m01460_a1.cl @@ -275,7 +275,7 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -373,7 +373,7 @@ __kernel void m01460_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -492,7 +492,7 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -602,7 +602,7 @@ __kernel void m01460_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01460_a3.cl b/OpenCL/m01460_a3.cl index 357c31853..d2406572f 100644 --- a/OpenCL/m01460_a3.cl +++ b/OpenCL/m01460_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 k_sha256[64] = { @@ -40,33 +40,33 @@ __constant u32 k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[8]) +static void sha256_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[8]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; + u32x e = digest[4]; + u32x f = digest[5]; + u32x g = digest[6]; + u32x h = digest[7]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #define ROUND_EXPAND() \ { \ @@ -126,7 +126,7 @@ static void sha256_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], digest[7] += h; } -static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8]) +static void hmac_sha256_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -185,7 +185,7 @@ static void hmac_sha256_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipa sha256_transform (w0, w1, w2, w3, opad); } -static void hmac_sha256_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[8], u32 opad[8], u32 digest[8]) +static void hmac_sha256_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[8], u32x opad[8], u32x digest[8]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -258,36 +258,36 @@ static void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = swap32 (salt_buf0[0]); w0_t[1] = swap32 (salt_buf0[1]); w0_t[2] = swap32 (salt_buf0[2]); w0_t[3] = swap32 (salt_buf0[3]); - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = swap32 (salt_buf1[0]); w1_t[1] = swap32 (salt_buf1[1]); w1_t[2] = swap32 (salt_buf1[2]); w1_t[3] = swap32 (salt_buf1[3]); - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[8]; - u32 opad[8]; + u32x ipad[8]; + u32x opad[8]; hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -297,13 +297,13 @@ static void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -320,16 +320,11 @@ static void m01460m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; - u32 digest[8]; + u32x digest[8]; hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[7]; - const u32 r2 = digest[2]; - const u32 r3 = digest[6]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]); } } @@ -364,36 +359,36 @@ static void m01460s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = swap32 (salt_buf0[0]); w0_t[1] = swap32 (salt_buf0[1]); w0_t[2] = swap32 (salt_buf0[2]); w0_t[3] = swap32 (salt_buf0[3]); - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = swap32 (salt_buf1[0]); w1_t[1] = swap32 (salt_buf1[1]); w1_t[2] = swap32 (salt_buf1[2]); w1_t[3] = swap32 (salt_buf1[3]); - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[8]; - u32 opad[8]; + u32x ipad[8]; + u32x opad[8]; hmac_sha256_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -415,13 +410,13 @@ static void m01460s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -438,16 +433,11 @@ static void m01460s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + pw_len) * 8; - u32 digest[8]; + u32x digest[8]; hmac_sha256_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[7]; - const u32 r2 = digest[2]; - const u32 r3 = digest[6]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]); } } diff --git a/OpenCL/m01500_a1.cl b/OpenCL/m01500_a1.cl index 9cc943812..aa98960a8 100644 --- a/OpenCL/m01500_a1.cl +++ b/OpenCL/m01500_a1.cl @@ -518,7 +518,7 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -601,7 +601,7 @@ __kernel void m01500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -709,7 +709,7 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -804,7 +804,7 @@ __kernel void m01500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01700_a1.cl b/OpenCL/m01700_a1.cl index 0f1a8914f..61c654c7a 100644 --- a/OpenCL/m01700_a1.cl +++ b/OpenCL/m01700_a1.cl @@ -191,7 +191,7 @@ __kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -236,7 +236,7 @@ __kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -368,7 +368,7 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -425,7 +425,7 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01700_a3.cl b/OpenCL/m01700_a3.cl index 98baf3842..24ddcaa54 100644 --- a/OpenCL/m01700_a3.cl +++ b/OpenCL/m01700_a3.cl @@ -5,6 +5,8 @@ #define _SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha512[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8]) +static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8]) { - u64 w0_t = hl32_to_64 (w0[0], w0[1]); - u64 w1_t = hl32_to_64 (w0[2], w0[3]); - u64 w2_t = hl32_to_64 (w1[0], w1[1]); - u64 w3_t = hl32_to_64 (w1[2], w1[3]); - u64 w4_t = hl32_to_64 (w2[0], w2[1]); - u64 w5_t = hl32_to_64 (w2[2], w2[3]); - u64 w6_t = hl32_to_64 (w3[0], w3[1]); - u64 w7_t = 0; - u64 w8_t = 0; - u64 w9_t = 0; - u64 wa_t = 0; - u64 wb_t = 0; - u64 wc_t = 0; - u64 wd_t = 0; - u64 we_t = 0; - u64 wf_t = hl32_to_64 (w3[2], w3[3]); + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = 0; + u64x w8_t = 0; + u64x w9_t = 0; + u64x wa_t = 0; + u64x wb_t = 0; + u64x wc_t = 0; + u64x wd_t = 0; + u64x we_t = 0; + u64x wf_t = hl32_to_64 (w3[2], w3[3]); - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -141,7 +141,7 @@ static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], digest[7] = h; } -static void m01700m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01700m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -156,16 +156,16 @@ static void m01700m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -184,7 +184,7 @@ static void m01700m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -197,17 +197,16 @@ static void m01700m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } -static void m01700s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01700s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -234,16 +233,16 @@ static void m01700s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -262,7 +261,7 @@ static void m01700s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -275,17 +274,16 @@ static void m01700s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } -__kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -323,7 +321,7 @@ __kernel void m01700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01700m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -361,7 +359,7 @@ __kernel void m01700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01700m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -399,7 +397,7 @@ __kernel void m01700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01700m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -437,7 +435,7 @@ __kernel void m01700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01700s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -475,7 +473,7 @@ __kernel void m01700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01700s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m01710_a0.cl b/OpenCL/m01710_a0.cl index 472209838..a2c28be22 100644 --- a/OpenCL/m01710_a0.cl +++ b/OpenCL/m01710_a0.cl @@ -263,7 +263,7 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 out_salt_len = out_len + salt_len; @@ -478,7 +478,7 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 out_salt_len = out_len + salt_len; diff --git a/OpenCL/m01710_a1.cl b/OpenCL/m01710_a1.cl index 442dc597d..85d481cce 100644 --- a/OpenCL/m01710_a1.cl +++ b/OpenCL/m01710_a1.cl @@ -189,7 +189,7 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -252,7 +252,7 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -287,7 +287,7 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -420,7 +420,7 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -495,7 +495,7 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -530,7 +530,7 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m01710_a3.cl b/OpenCL/m01710_a3.cl index d02a158c6..7ec0b43fb 100644 --- a/OpenCL/m01710_a3.cl +++ b/OpenCL/m01710_a3.cl @@ -5,6 +5,8 @@ #define _SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha512[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8]) +static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8]) { - u64 w0_t = hl32_to_64 (w0[0], w0[1]); - u64 w1_t = hl32_to_64 (w0[2], w0[3]); - u64 w2_t = hl32_to_64 (w1[0], w1[1]); - u64 w3_t = hl32_to_64 (w1[2], w1[3]); - u64 w4_t = hl32_to_64 (w2[0], w2[1]); - u64 w5_t = hl32_to_64 (w2[2], w2[3]); - u64 w6_t = hl32_to_64 (w3[0], w3[1]); - u64 w7_t = 0; - u64 w8_t = 0; - u64 w9_t = 0; - u64 wa_t = 0; - u64 wb_t = 0; - u64 wc_t = 0; - u64 wd_t = 0; - u64 we_t = 0; - u64 wf_t = hl32_to_64 (w3[2], w3[3]); + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = 0; + u64x w8_t = 0; + u64x w9_t = 0; + u64x wa_t = 0; + u64x wb_t = 0; + u64x wc_t = 0; + u64x wd_t = 0; + u64x we_t = 0; + u64x wf_t = hl32_to_64 (w3[2], w3[3]); - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -141,7 +141,7 @@ static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], digest[7] = h; } -static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -182,24 +182,24 @@ static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap32 (salt_buf0[0]); - w[ 1] |= swap32 (salt_buf0[1]); - w[ 2] |= swap32 (salt_buf0[2]); - w[ 3] |= swap32 (salt_buf0[3]); - w[ 4] |= swap32 (salt_buf1[0]); - w[ 5] |= swap32 (salt_buf1[1]); - w[ 6] |= swap32 (salt_buf1[2]); - w[ 7] |= swap32 (salt_buf1[3]); - w[ 8] |= swap32 (salt_buf2[0]); - w[ 9] |= swap32 (salt_buf2[1]); - w[10] |= swap32 (salt_buf2[2]); - w[11] |= swap32 (salt_buf2[3]); - w[12] |= swap32 (salt_buf3[0]); - w[13] |= swap32 (salt_buf3[1]); - w[14] |= swap32 (salt_buf3[2]); - w[15] |= swap32 (salt_buf3[3]); + w[ 0] |= swap32_S (salt_buf0[0]); + w[ 1] |= swap32_S (salt_buf0[1]); + w[ 2] |= swap32_S (salt_buf0[2]); + w[ 3] |= swap32_S (salt_buf0[3]); + w[ 4] |= swap32_S (salt_buf1[0]); + w[ 5] |= swap32_S (salt_buf1[1]); + w[ 6] |= swap32_S (salt_buf1[2]); + w[ 7] |= swap32_S (salt_buf1[3]); + w[ 8] |= swap32_S (salt_buf2[0]); + w[ 9] |= swap32_S (salt_buf2[1]); + w[10] |= swap32_S (salt_buf2[2]); + w[11] |= swap32_S (salt_buf2[3]); + w[12] |= swap32_S (salt_buf3[0]); + w[13] |= swap32_S (salt_buf3[1]); + w[14] |= swap32_S (salt_buf3[2]); + w[15] |= swap32_S (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -213,16 +213,16 @@ static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -241,7 +241,7 @@ static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -254,17 +254,16 @@ static void m01710m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } -static void m01710s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01710s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -291,16 +290,16 @@ static void m01710s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -319,7 +318,7 @@ static void m01710s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -332,17 +331,16 @@ static void m01710s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } -__kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -380,7 +378,7 @@ __kernel void m01710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01710m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01710_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01710_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -418,7 +416,7 @@ __kernel void m01710_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01710m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01710_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01710_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -456,7 +454,7 @@ __kernel void m01710_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01710m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -494,7 +492,7 @@ __kernel void m01710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01710s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01710_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01710_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -532,7 +530,7 @@ __kernel void m01710_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01710s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01710_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01710_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m01720_a0.cl b/OpenCL/m01720_a0.cl index d0466153b..0b5e7d0e9 100644 --- a/OpenCL/m01720_a0.cl +++ b/OpenCL/m01720_a0.cl @@ -237,7 +237,7 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 out_salt_len = out_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; @@ -413,7 +413,7 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 out_salt_len = out_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; diff --git a/OpenCL/m01720_a1.cl b/OpenCL/m01720_a1.cl index 0664f8dcb..f3600f2c3 100644 --- a/OpenCL/m01720_a1.cl +++ b/OpenCL/m01720_a1.cl @@ -189,7 +189,7 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -246,7 +246,7 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -277,7 +277,7 @@ __kernel void m01720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = pw_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; @@ -395,7 +395,7 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -464,7 +464,7 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -495,7 +495,7 @@ __kernel void m01720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, const u32 pw_salt_len = pw_len + salt_len; - switch_buffer_by_offset (w0, w1, w2, w3, salt_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, salt_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; diff --git a/OpenCL/m01720_a3.cl b/OpenCL/m01720_a3.cl index 8da606934..cd667943a 100644 --- a/OpenCL/m01720_a3.cl +++ b/OpenCL/m01720_a3.cl @@ -5,6 +5,8 @@ #define _SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha512[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8]) +static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8]) { - u64 w0_t = hl32_to_64 (w0[0], w0[1]); - u64 w1_t = hl32_to_64 (w0[2], w0[3]); - u64 w2_t = hl32_to_64 (w1[0], w1[1]); - u64 w3_t = hl32_to_64 (w1[2], w1[3]); - u64 w4_t = hl32_to_64 (w2[0], w2[1]); - u64 w5_t = hl32_to_64 (w2[2], w2[3]); - u64 w6_t = hl32_to_64 (w3[0], w3[1]); - u64 w7_t = 0; - u64 w8_t = 0; - u64 w9_t = 0; - u64 wa_t = 0; - u64 wb_t = 0; - u64 wc_t = 0; - u64 wd_t = 0; - u64 we_t = 0; - u64 wf_t = hl32_to_64 (w3[2], w3[3]); + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = 0; + u64x w8_t = 0; + u64x w9_t = 0; + u64x wa_t = 0; + u64x wb_t = 0; + u64x wc_t = 0; + u64x wd_t = 0; + u64x we_t = 0; + u64x wf_t = hl32_to_64 (w3[2], w3[3]); - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -186,85 +186,129 @@ static void m01720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + overwrite_at_be (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; + w3_t[2] = 0; + w3_t[3] = pw_salt_len * 8; /** * sha512 */ - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - //w3_t[2] = swap32 (w3_t[2]); - //w3_t[3] = swap32 (w3_t[3]); - - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -277,13 +321,12 @@ static void m01720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -344,85 +387,128 @@ static void m01720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * prepend salt */ - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x wx[16]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + overwrite_at_be (wx, w0lr, salt_len); - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - /** - * sha512 - */ + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; + w3_t[2] = 0; + w3_t[3] = pw_salt_len * 8; - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - //w3_t[2] = swap32 (w3_t[2]); - //w3_t[3] = swap32 (w3_t[3]); - - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -435,13 +521,12 @@ static void m01720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m01730_a0.cl b/OpenCL/m01730_a0.cl index 7b094b127..fa28e6205 100644 --- a/OpenCL/m01730_a0.cl +++ b/OpenCL/m01730_a0.cl @@ -263,7 +263,7 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; @@ -478,7 +478,7 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (out_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (out_len * 2)); const u32 out_salt_len = (out_len * 2) + salt_len; diff --git a/OpenCL/m01730_a1.cl b/OpenCL/m01730_a1.cl index 5cead371d..fea7d8360 100644 --- a/OpenCL/m01730_a1.cl +++ b/OpenCL/m01730_a1.cl @@ -189,7 +189,7 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -246,7 +246,7 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -303,7 +303,7 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; @@ -434,7 +434,7 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -503,7 +503,7 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -560,7 +560,7 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, (pw_len * 2)); + switch_buffer_by_offset_le (s0, s1, s2, s3, (pw_len * 2)); const u32 pw_salt_len = (pw_len * 2) + salt_len; diff --git a/OpenCL/m01730_a3.cl b/OpenCL/m01730_a3.cl index 978a4a3f6..df629bbc7 100644 --- a/OpenCL/m01730_a3.cl +++ b/OpenCL/m01730_a3.cl @@ -5,6 +5,8 @@ #define _SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha512[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8]) +static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8]) { - u64 w0_t = hl32_to_64 (w0[0], w0[1]); - u64 w1_t = hl32_to_64 (w0[2], w0[3]); - u64 w2_t = hl32_to_64 (w1[0], w1[1]); - u64 w3_t = hl32_to_64 (w1[2], w1[3]); - u64 w4_t = hl32_to_64 (w2[0], w2[1]); - u64 w5_t = hl32_to_64 (w2[2], w2[3]); - u64 w6_t = hl32_to_64 (w3[0], w3[1]); - u64 w7_t = 0; - u64 w8_t = 0; - u64 w9_t = 0; - u64 wa_t = 0; - u64 wb_t = 0; - u64 wc_t = 0; - u64 wd_t = 0; - u64 we_t = 0; - u64 wf_t = hl32_to_64 (w3[2], w3[3]); + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = 0; + u64x w8_t = 0; + u64x w9_t = 0; + u64x wa_t = 0; + u64x wb_t = 0; + u64x wc_t = 0; + u64x wd_t = 0; + u64x we_t = 0; + u64x wf_t = hl32_to_64 (w3[2], w3[3]); - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -141,7 +141,7 @@ static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], digest[7] = h; } -static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -182,24 +182,24 @@ static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); - w[ 0] |= swap32 (salt_buf0[0]); - w[ 1] |= swap32 (salt_buf0[1]); - w[ 2] |= swap32 (salt_buf0[2]); - w[ 3] |= swap32 (salt_buf0[3]); - w[ 4] |= swap32 (salt_buf1[0]); - w[ 5] |= swap32 (salt_buf1[1]); - w[ 6] |= swap32 (salt_buf1[2]); - w[ 7] |= swap32 (salt_buf1[3]); - w[ 8] |= swap32 (salt_buf2[0]); - w[ 9] |= swap32 (salt_buf2[1]); - w[10] |= swap32 (salt_buf2[2]); - w[11] |= swap32 (salt_buf2[3]); - w[12] |= swap32 (salt_buf3[0]); - w[13] |= swap32 (salt_buf3[1]); - w[14] |= swap32 (salt_buf3[2]); - w[15] |= swap32 (salt_buf3[3]); + w[ 0] |= swap32_S (salt_buf0[0]); + w[ 1] |= swap32_S (salt_buf0[1]); + w[ 2] |= swap32_S (salt_buf0[2]); + w[ 3] |= swap32_S (salt_buf0[3]); + w[ 4] |= swap32_S (salt_buf1[0]); + w[ 5] |= swap32_S (salt_buf1[1]); + w[ 6] |= swap32_S (salt_buf1[2]); + w[ 7] |= swap32_S (salt_buf1[3]); + w[ 8] |= swap32_S (salt_buf2[0]); + w[ 9] |= swap32_S (salt_buf2[1]); + w[10] |= swap32_S (salt_buf2[2]); + w[11] |= swap32_S (salt_buf2[3]); + w[12] |= swap32_S (salt_buf3[0]); + w[13] |= swap32_S (salt_buf3[1]); + w[14] |= swap32_S (salt_buf3[2]); + w[15] |= swap32_S (salt_buf3[3]); const u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -213,17 +213,16 @@ static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -242,7 +241,7 @@ static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -255,17 +254,16 @@ static void m01730m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } -static void m01730s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m01730s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -292,16 +290,16 @@ static void m01730s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -320,7 +318,7 @@ static void m01730s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -333,17 +331,16 @@ static void m01730s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } -__kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -381,7 +378,7 @@ __kernel void m01730_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01730m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01730_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01730_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -419,7 +416,7 @@ __kernel void m01730_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01730m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01730_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01730_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -457,7 +454,7 @@ __kernel void m01730_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01730m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -495,7 +492,7 @@ __kernel void m01730_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01730s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01730_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01730_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -533,7 +530,7 @@ __kernel void m01730_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m01730s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m01730_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m01730_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m01740_a0.cl b/OpenCL/m01740_a0.cl index 281397d7a..15f938911 100644 --- a/OpenCL/m01740_a0.cl +++ b/OpenCL/m01740_a0.cl @@ -239,7 +239,7 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -412,7 +412,7 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m01740_a1.cl b/OpenCL/m01740_a1.cl index 29d1255fb..878a3816e 100644 --- a/OpenCL/m01740_a1.cl +++ b/OpenCL/m01740_a1.cl @@ -189,7 +189,7 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -246,7 +246,7 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -285,7 +285,7 @@ __kernel void m01740_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -398,7 +398,7 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -467,7 +467,7 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -506,7 +506,7 @@ __kernel void m01740_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m01740_a3.cl b/OpenCL/m01740_a3.cl index 8d7c7c3af..857095d93 100644 --- a/OpenCL/m01740_a3.cl +++ b/OpenCL/m01740_a3.cl @@ -5,6 +5,8 @@ #define _SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha512[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -static void sha512_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8]) +static void sha512_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8]) { - u64 w0_t = hl32_to_64 (w0[0], w0[1]); - u64 w1_t = hl32_to_64 (w0[2], w0[3]); - u64 w2_t = hl32_to_64 (w1[0], w1[1]); - u64 w3_t = hl32_to_64 (w1[2], w1[3]); - u64 w4_t = hl32_to_64 (w2[0], w2[1]); - u64 w5_t = hl32_to_64 (w2[2], w2[3]); - u64 w6_t = hl32_to_64 (w3[0], w3[1]); - u64 w7_t = 0; - u64 w8_t = 0; - u64 w9_t = 0; - u64 wa_t = 0; - u64 wb_t = 0; - u64 wc_t = 0; - u64 wd_t = 0; - u64 we_t = 0; - u64 wf_t = hl32_to_64 (w3[2], w3[3]); + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = 0; + u64x w8_t = 0; + u64x w9_t = 0; + u64x wa_t = 0; + u64x wb_t = 0; + u64x wc_t = 0; + u64x wd_t = 0; + u64x we_t = 0; + u64x wf_t = hl32_to_64 (w3[2], w3[3]); - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -186,85 +186,129 @@ static void m01740m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - /** - * prepend salt - */ + u32x wx[16]; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + overwrite_at_be (wx, w0lr, salt_len); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; + w3_t[2] = 0; + w3_t[3] = pw_salt_len * 8; /** * sha512 */ - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - //w3_t[2] = swap32 (w3_t[2]); - //w3_t[3] = swap32 (w3_t[3]); - - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -277,13 +321,12 @@ static void m01740m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -344,85 +387,128 @@ static void m01740s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 pw_salt_len = pw_len + salt_len; + /** + * prepend salt + */ + + u32 w0_t[4]; + u32 w1_t[4]; + u32 w2_t[4]; + u32 w3_t[4]; + + w0_t[0] = swap32_S (w0[0]); + w0_t[1] = swap32_S (w0[1]); + w0_t[2] = swap32_S (w0[2]); + w0_t[3] = swap32_S (w0[3]); + w1_t[0] = swap32_S (w1[0]); + w1_t[1] = swap32_S (w1[1]); + w1_t[2] = swap32_S (w1[2]); + w1_t[3] = swap32_S (w1[3]); + w2_t[0] = swap32_S (w2[0]); + w2_t[1] = swap32_S (w2[1]); + w2_t[2] = swap32_S (w2[2]); + w2_t[3] = swap32_S (w2[3]); + w3_t[0] = swap32_S (w3[0]); + w3_t[1] = swap32_S (w3[1]); + w3_t[2] = swap32_S (w3[2]); + w3_t[3] = swap32_S (w3[3]); + + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); + + w0_t[0] |= salt_buf0[0]; + w0_t[1] |= salt_buf0[1]; + w0_t[2] |= salt_buf0[2]; + w0_t[3] |= salt_buf0[3]; + w1_t[0] |= salt_buf1[0]; + w1_t[1] |= salt_buf1[1]; + w1_t[2] |= salt_buf1[2]; + w1_t[3] |= salt_buf1[3]; + w2_t[0] |= salt_buf2[0]; + w2_t[1] |= salt_buf2[1]; + w2_t[2] |= salt_buf2[2]; + w2_t[3] |= salt_buf2[3]; + w3_t[0] |= salt_buf3[0]; + w3_t[1] |= salt_buf3[1]; + w3_t[2] |= salt_buf3[2]; + w3_t[3] |= salt_buf3[3]; + + w0_t[0] = swap32_S (w0_t[0]); + w0_t[1] = swap32_S (w0_t[1]); + w0_t[2] = swap32_S (w0_t[2]); + w0_t[3] = swap32_S (w0_t[3]); + w1_t[0] = swap32_S (w1_t[0]); + w1_t[1] = swap32_S (w1_t[1]); + w1_t[2] = swap32_S (w1_t[2]); + w1_t[3] = swap32_S (w1_t[3]); + w2_t[0] = swap32_S (w2_t[0]); + w2_t[1] = swap32_S (w2_t[1]); + w2_t[2] = swap32_S (w2_t[2]); + w2_t[3] = swap32_S (w2_t[3]); + w3_t[0] = swap32_S (w3_t[0]); + w3_t[1] = swap32_S (w3_t[1]); + w3_t[2] = swap32_S (w3_t[2]); + w3_t[3] = swap32_S (w3_t[3]); + /** * loop */ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * prepend salt */ - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x wx[16]; - w0_t[0] = swap32 (w0[0]); - w0_t[1] = swap32 (w0[1]); - w0_t[2] = swap32 (w0[2]); - w0_t[3] = swap32 (w0[3]); - w1_t[0] = swap32 (w1[0]); - w1_t[1] = swap32 (w1[1]); - w1_t[2] = swap32 (w1[2]); - w1_t[3] = swap32 (w1[3]); - w2_t[0] = swap32 (w2[0]); - w2_t[1] = swap32 (w2[1]); - w2_t[2] = swap32 (w2[2]); - w2_t[3] = swap32 (w2[3]); - w3_t[0] = swap32 (w3[0]); - w3_t[1] = swap32 (w3[1]); - w3_t[2] = swap32 (w3[2]); - w3_t[3] = swap32 (w3[3]); + wx[ 0] = w0_t[0]; + wx[ 1] = w0_t[1]; + wx[ 2] = w0_t[2]; + wx[ 3] = w0_t[3]; + wx[ 4] = w1_t[0]; + wx[ 5] = w1_t[1]; + wx[ 6] = w1_t[2]; + wx[ 7] = w1_t[3]; + wx[ 8] = w2_t[0]; + wx[ 9] = w2_t[1]; + wx[10] = w2_t[2]; + wx[11] = w2_t[3]; + wx[12] = w3_t[0]; + wx[13] = w3_t[1]; + wx[14] = w3_t[2]; + wx[15] = w3_t[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + overwrite_at_be (wx, w0lr, salt_len); - w0_t[0] |= salt_buf0[0]; - w0_t[1] |= salt_buf0[1]; - w0_t[2] |= salt_buf0[2]; - w0_t[3] |= salt_buf0[3]; - w1_t[0] |= salt_buf1[0]; - w1_t[1] |= salt_buf1[1]; - w1_t[2] |= salt_buf1[2]; - w1_t[3] |= salt_buf1[3]; - w2_t[0] |= salt_buf2[0]; - w2_t[1] |= salt_buf2[1]; - w2_t[2] |= salt_buf2[2]; - w2_t[3] |= salt_buf2[3]; - w3_t[0] |= salt_buf3[0]; - w3_t[1] |= salt_buf3[1]; - w3_t[2] = 0; - w3_t[3] = pw_salt_len * 8; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - /** - * sha512 - */ + w0_t[0] = wx[ 0]; + w0_t[1] = wx[ 1]; + w0_t[2] = wx[ 2]; + w0_t[3] = wx[ 3]; + w1_t[0] = wx[ 4]; + w1_t[1] = wx[ 5]; + w1_t[2] = wx[ 6]; + w1_t[3] = wx[ 7]; + w2_t[0] = wx[ 8]; + w2_t[1] = wx[ 9]; + w2_t[2] = wx[10]; + w2_t[3] = wx[11]; + w3_t[0] = wx[12]; + w3_t[1] = wx[13]; + w3_t[2] = 0; + w3_t[3] = pw_salt_len * 8; - w0_t[0] = swap32 (w0_t[0]); - w0_t[1] = swap32 (w0_t[1]); - w0_t[2] = swap32 (w0_t[2]); - w0_t[3] = swap32 (w0_t[3]); - w1_t[0] = swap32 (w1_t[0]); - w1_t[1] = swap32 (w1_t[1]); - w1_t[2] = swap32 (w1_t[2]); - w1_t[3] = swap32 (w1_t[3]); - w2_t[0] = swap32 (w2_t[0]); - w2_t[1] = swap32 (w2_t[1]); - w2_t[2] = swap32 (w2_t[2]); - w2_t[3] = swap32 (w2_t[3]); - w3_t[0] = swap32 (w3_t[0]); - w3_t[1] = swap32 (w3_t[1]); - //w3_t[2] = swap32 (w3_t[2]); - //w3_t[3] = swap32 (w3_t[3]); - - u64 digest[8]; + u64x digest[8]; digest[0] = SHA512M_A; digest[1] = SHA512M_B; @@ -435,13 +521,12 @@ static void m01740s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha512_transform (w0_t, w1_t, w2_t, w3_t, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m01750_a1.cl b/OpenCL/m01750_a1.cl index 0b18b1e4a..a14700f3a 100644 --- a/OpenCL/m01750_a1.cl +++ b/OpenCL/m01750_a1.cl @@ -306,7 +306,7 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -369,7 +369,7 @@ __kernel void m01750_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -524,7 +524,7 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -599,7 +599,7 @@ __kernel void m01750_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01750_a3.cl b/OpenCL/m01750_a3.cl index e650a8e9a..879248fe5 100644 --- a/OpenCL/m01750_a3.cl +++ b/OpenCL/m01750_a3.cl @@ -5,6 +5,8 @@ #define _SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha512[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], const u64 w3[4], u64 digest[8]) +static void sha512_transform (const u64x w0[4], const u64x w1[4], const u64x w2[4], const u64x w3[4], u64x digest[8]) { - u64 w0_t = w0[0]; - u64 w1_t = w0[1]; - u64 w2_t = w0[2]; - u64 w3_t = w0[3]; - u64 w4_t = w1[0]; - u64 w5_t = w1[1]; - u64 w6_t = w1[2]; - u64 w7_t = w1[3]; - u64 w8_t = w2[0]; - u64 w9_t = w2[1]; - u64 wa_t = w2[2]; - u64 wb_t = w2[3]; - u64 wc_t = w3[0]; - u64 wd_t = w3[1]; - u64 we_t = w3[2]; - u64 wf_t = w3[3]; + u64x w0_t = w0[0]; + u64x w1_t = w0[1]; + u64x w2_t = w0[2]; + u64x w3_t = w0[3]; + u64x w4_t = w1[0]; + u64x w5_t = w1[1]; + u64x w6_t = w1[2]; + u64x w7_t = w1[3]; + u64x w8_t = w2[0]; + u64x w9_t = w2[1]; + u64x wa_t = w2[2]; + u64x wb_t = w2[3]; + u64x wc_t = w3[0]; + u64x wd_t = w3[1]; + u64x we_t = w3[2]; + u64x wf_t = w3[3]; - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -130,12 +130,12 @@ static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], digest[7] += h; } -static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8]) +static void hmac_sha512_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8]) { - u64 w0_t[4]; - u64 w1_t[4]; - u64 w2_t[4]; - u64 w3_t[4]; + u64x w0_t[4]; + u64x w1_t[4]; + u64x w2_t[4]; + u64x w3_t[4]; w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ 0x3636363636363636; w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ 0x3636363636363636; @@ -145,14 +145,14 @@ static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipa w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x3636363636363636; w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x3636363636363636; w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x3636363636363636; - w2_t[0] = 0x3636363636363636; - w2_t[1] = 0x3636363636363636; - w2_t[2] = 0x3636363636363636; - w2_t[3] = 0x3636363636363636; - w3_t[0] = 0x3636363636363636; - w3_t[1] = 0x3636363636363636; - w3_t[2] = 0x3636363636363636; - w3_t[3] = 0x3636363636363636; + w2_t[0] = 0x3636363636363636; + w2_t[1] = 0x3636363636363636; + w2_t[2] = 0x3636363636363636; + w2_t[3] = 0x3636363636363636; + w3_t[0] = 0x3636363636363636; + w3_t[1] = 0x3636363636363636; + w3_t[2] = 0x3636363636363636; + w3_t[3] = 0x3636363636363636; ipad[0] = SHA512M_A; ipad[1] = SHA512M_B; @@ -173,14 +173,14 @@ static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipa w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x5c5c5c5c5c5c5c5c; w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x5c5c5c5c5c5c5c5c; w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x5c5c5c5c5c5c5c5c; - w2_t[0] = 0x5c5c5c5c5c5c5c5c; - w2_t[1] = 0x5c5c5c5c5c5c5c5c; - w2_t[2] = 0x5c5c5c5c5c5c5c5c; - w2_t[3] = 0x5c5c5c5c5c5c5c5c; - w3_t[0] = 0x5c5c5c5c5c5c5c5c; - w3_t[1] = 0x5c5c5c5c5c5c5c5c; - w3_t[2] = 0x5c5c5c5c5c5c5c5c; - w3_t[3] = 0x5c5c5c5c5c5c5c5c; + w2_t[0] = 0x5c5c5c5c5c5c5c5c; + w2_t[1] = 0x5c5c5c5c5c5c5c5c; + w2_t[2] = 0x5c5c5c5c5c5c5c5c; + w2_t[3] = 0x5c5c5c5c5c5c5c5c; + w3_t[0] = 0x5c5c5c5c5c5c5c5c; + w3_t[1] = 0x5c5c5c5c5c5c5c5c; + w3_t[2] = 0x5c5c5c5c5c5c5c5c; + w3_t[3] = 0x5c5c5c5c5c5c5c5c; opad[0] = SHA512M_A; opad[1] = SHA512M_B; @@ -194,12 +194,12 @@ static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipa sha512_transform (w0_t, w1_t, w2_t, w3_t, opad); } -static void hmac_sha512_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8], u64 digest[8]) +static void hmac_sha512_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8], u64x digest[8]) { - u64 w0_t[4]; - u64 w1_t[4]; - u64 w2_t[4]; - u64 w3_t[4]; + u64x w0_t[4]; + u64x w1_t[4]; + u64x w2_t[4]; + u64x w3_t[4]; w0_t[0] = hl32_to_64 (w0[0], w0[1]); w0_t[1] = hl32_to_64 (w0[2], w0[3]); @@ -293,46 +293,46 @@ static void m01750m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u64 ipad[8]; - u64 opad[8]; + u64x ipad[8]; + u64x opad[8]; hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -353,17 +353,16 @@ static void m01750m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (128 + salt_len) * 8; - u64 digest[8]; + u64x digest[8]; hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -414,46 +413,46 @@ static void m01750s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u64 ipad[8]; - u64 opad[8]; + u64x ipad[8]; + u64x opad[8]; hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -474,17 +473,16 @@ static void m01750s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (128 + salt_len) * 8; - u64 digest[8]; + u64x digest[8]; hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m01760_a1.cl b/OpenCL/m01760_a1.cl index ac445ce2b..78d152766 100644 --- a/OpenCL/m01760_a1.cl +++ b/OpenCL/m01760_a1.cl @@ -306,7 +306,7 @@ __kernel void m01760_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -404,7 +404,7 @@ __kernel void m01760_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -524,7 +524,7 @@ __kernel void m01760_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -634,7 +634,7 @@ __kernel void m01760_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m01760_a3.cl b/OpenCL/m01760_a3.cl index 768384e45..461a733d6 100644 --- a/OpenCL/m01760_a3.cl +++ b/OpenCL/m01760_a3.cl @@ -5,6 +5,8 @@ #define _SHA512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha512[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha512[80] = SHA512C4c, SHA512C4d, SHA512C4e, SHA512C4f, }; -static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], const u64 w3[4], u64 digest[8]) +static void sha512_transform (const u64x w0[4], const u64x w1[4], const u64x w2[4], const u64x w3[4], u64x digest[8]) { - u64 w0_t = w0[0]; - u64 w1_t = w0[1]; - u64 w2_t = w0[2]; - u64 w3_t = w0[3]; - u64 w4_t = w1[0]; - u64 w5_t = w1[1]; - u64 w6_t = w1[2]; - u64 w7_t = w1[3]; - u64 w8_t = w2[0]; - u64 w9_t = w2[1]; - u64 wa_t = w2[2]; - u64 wb_t = w2[3]; - u64 wc_t = w3[0]; - u64 wd_t = w3[1]; - u64 we_t = w3[2]; - u64 wf_t = w3[3]; + u64x w0_t = w0[0]; + u64x w1_t = w0[1]; + u64x w2_t = w0[2]; + u64x w3_t = w0[3]; + u64x w4_t = w1[0]; + u64x w5_t = w1[1]; + u64x w6_t = w1[2]; + u64x w7_t = w1[3]; + u64x w8_t = w2[0]; + u64x w9_t = w2[1]; + u64x wa_t = w2[2]; + u64x wb_t = w2[3]; + u64x wc_t = w3[0]; + u64x wd_t = w3[1]; + u64x we_t = w3[2]; + u64x wf_t = w3[3]; - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -130,12 +130,12 @@ static void sha512_transform (const u64 w0[4], const u64 w1[4], const u64 w2[4], digest[7] += h; } -static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8]) +static void hmac_sha512_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8]) { - u64 w0_t[4]; - u64 w1_t[4]; - u64 w2_t[4]; - u64 w3_t[4]; + u64x w0_t[4]; + u64x w1_t[4]; + u64x w2_t[4]; + u64x w3_t[4]; w0_t[0] = hl32_to_64 (w0[0], w0[1]) ^ 0x3636363636363636; w0_t[1] = hl32_to_64 (w0[2], w0[3]) ^ 0x3636363636363636; @@ -145,14 +145,14 @@ static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipa w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x3636363636363636; w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x3636363636363636; w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x3636363636363636; - w2_t[0] = 0 ^ 0x3636363636363636; - w2_t[1] = 0 ^ 0x3636363636363636; - w2_t[2] = 0 ^ 0x3636363636363636; - w2_t[3] = 0 ^ 0x3636363636363636; - w3_t[0] = 0 ^ 0x3636363636363636; - w3_t[1] = 0 ^ 0x3636363636363636; - w3_t[2] = 0 ^ 0x3636363636363636; - w3_t[3] = 0 ^ 0x3636363636363636; + w2_t[0] = 0x3636363636363636; + w2_t[1] = 0x3636363636363636; + w2_t[2] = 0x3636363636363636; + w2_t[3] = 0x3636363636363636; + w3_t[0] = 0x3636363636363636; + w3_t[1] = 0x3636363636363636; + w3_t[2] = 0x3636363636363636; + w3_t[3] = 0x3636363636363636; ipad[0] = SHA512M_A; ipad[1] = SHA512M_B; @@ -173,14 +173,14 @@ static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipa w1_t[1] = hl32_to_64 (w2[2], w2[3]) ^ 0x5c5c5c5c5c5c5c5c; w1_t[2] = hl32_to_64 (w3[0], w3[1]) ^ 0x5c5c5c5c5c5c5c5c; w1_t[3] = hl32_to_64 (w3[2], w3[3]) ^ 0x5c5c5c5c5c5c5c5c; - w2_t[0] = 0 ^ 0x5c5c5c5c5c5c5c5c; - w2_t[1] = 0 ^ 0x5c5c5c5c5c5c5c5c; - w2_t[2] = 0 ^ 0x5c5c5c5c5c5c5c5c; - w2_t[3] = 0 ^ 0x5c5c5c5c5c5c5c5c; - w3_t[0] = 0 ^ 0x5c5c5c5c5c5c5c5c; - w3_t[1] = 0 ^ 0x5c5c5c5c5c5c5c5c; - w3_t[2] = 0 ^ 0x5c5c5c5c5c5c5c5c; - w3_t[3] = 0 ^ 0x5c5c5c5c5c5c5c5c; + w2_t[0] = 0x5c5c5c5c5c5c5c5c; + w2_t[1] = 0x5c5c5c5c5c5c5c5c; + w2_t[2] = 0x5c5c5c5c5c5c5c5c; + w2_t[3] = 0x5c5c5c5c5c5c5c5c; + w3_t[0] = 0x5c5c5c5c5c5c5c5c; + w3_t[1] = 0x5c5c5c5c5c5c5c5c; + w3_t[2] = 0x5c5c5c5c5c5c5c5c; + w3_t[3] = 0x5c5c5c5c5c5c5c5c; opad[0] = SHA512M_A; opad[1] = SHA512M_B; @@ -194,12 +194,12 @@ static void hmac_sha512_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipa sha512_transform (w0_t, w1_t, w2_t, w3_t, opad); } -static void hmac_sha512_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u64 ipad[8], u64 opad[8], u64 digest[8]) +static void hmac_sha512_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u64x ipad[8], u64x opad[8], u64x digest[8]) { - u64 w0_t[4]; - u64 w1_t[4]; - u64 w2_t[4]; - u64 w3_t[4]; + u64x w0_t[4]; + u64x w1_t[4]; + u64x w2_t[4]; + u64x w3_t[4]; w0_t[0] = hl32_to_64 (w0[0], w0[1]); w0_t[1] = hl32_to_64 (w0[2], w0[3]); @@ -289,36 +289,36 @@ static void m01760m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = swap32 (salt_buf0[0]); w0_t[1] = swap32 (salt_buf0[1]); w0_t[2] = swap32 (salt_buf0[2]); w0_t[3] = swap32 (salt_buf0[3]); - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = swap32 (salt_buf1[0]); w1_t[1] = swap32 (salt_buf1[1]); w1_t[2] = swap32 (salt_buf1[2]); w1_t[3] = swap32 (salt_buf1[3]); - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u64 ipad[8]; - u64 opad[8]; + u64x ipad[8]; + u64x opad[8]; hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -328,13 +328,13 @@ static void m01760m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -351,17 +351,16 @@ static void m01760m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (128 + pw_len) * 8; - u64 digest[8]; + u64x digest[8]; hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -396,36 +395,36 @@ static void m01760s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * pads */ - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = swap32 (salt_buf0[0]); w0_t[1] = swap32 (salt_buf0[1]); w0_t[2] = swap32 (salt_buf0[2]); w0_t[3] = swap32 (salt_buf0[3]); - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = swap32 (salt_buf1[0]); w1_t[1] = swap32 (salt_buf1[1]); w1_t[2] = swap32 (salt_buf1[2]); w1_t[3] = swap32 (salt_buf1[3]); - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - u64 ipad[8]; - u64 opad[8]; + u64x ipad[8]; + u64x opad[8]; hmac_sha512_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -447,13 +446,13 @@ static void m01760s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -470,17 +469,16 @@ static void m01760s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (128 + pw_len) * 8; - u64 digest[8]; + u64x digest[8]; hmac_sha512_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); + const u32x r0 = l32_from_64 (digest[7]); + const u32x r1 = h32_from_64 (digest[7]); + const u32x r2 = l32_from_64 (digest[3]); + const u32x r3 = h32_from_64 (digest[3]); - const u32 r0 = l32_from_64 (digest[7]); - const u32 r1 = h32_from_64 (digest[7]); - const u32 r2 = l32_from_64 (digest[3]); - const u32 r3 = h32_from_64 (digest[3]); - - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m02400_a1.cl b/OpenCL/m02400_a1.cl index ce37c8fa0..7ce829855 100644 --- a/OpenCL/m02400_a1.cl +++ b/OpenCL/m02400_a1.cl @@ -68,7 +68,7 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -111,7 +111,7 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -292,7 +292,7 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -347,7 +347,7 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m02400_a3.cl b/OpenCL/m02400_a3.cl index 2e1fbff07..62cc11486 100644 --- a/OpenCL/m02400_a3.cl +++ b/OpenCL/m02400_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m02400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m02400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -114,18 +114,18 @@ static void m02400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00); MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01); @@ -200,16 +200,11 @@ static void m02400m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c &= 0x00ffffff; b &= 0x00ffffff; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } -static void m02400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m02400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -315,18 +310,18 @@ static void m02400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00); MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01); @@ -393,9 +388,7 @@ static void m02400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD5_STEP0(MD5_I , b, c, d, a, I_wdc3b, MD5S33); MD5_STEP0(MD5_I , a, b, c, d, I_w4c3c, MD5S30); - bool q_cond = allx ((a & 0x00ffffff) != search[0]); - - if (q_cond) continue; + if (MATCHES_NONE_VS ((a & 0x00ffffff), search[0])) continue; MD5_STEP0(MD5_I , d, a, b, c, I_wbc3d, MD5S31); MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32); @@ -406,16 +399,11 @@ static void m02400s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c &= 0x00ffffff; b &= 0x00ffffff; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } -__kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -453,15 +441,15 @@ __kernel void m02400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m02400m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m02400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m02400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -499,10 +487,10 @@ __kernel void m02400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m02400s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m02400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m02400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } diff --git a/OpenCL/m02410_a0.cl b/OpenCL/m02410_a0.cl index 099ed4696..5e55639cd 100644 --- a/OpenCL/m02410_a0.cl +++ b/OpenCL/m02410_a0.cl @@ -128,7 +128,7 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t * rules_bu s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); w0[0] |= s0[0]; w0[1] |= s0[1]; @@ -357,7 +357,7 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t * rules_bu s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); w0[0] |= s0[0]; w0[1] |= s0[1]; diff --git a/OpenCL/m02410_a1.cl b/OpenCL/m02410_a1.cl index 6fe15f592..c1d176fbe 100644 --- a/OpenCL/m02410_a1.cl +++ b/OpenCL/m02410_a1.cl @@ -68,7 +68,7 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -124,7 +124,7 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -159,7 +159,7 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -341,7 +341,7 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -409,7 +409,7 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -444,7 +444,7 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m02410_a3.cl b/OpenCL/m02410_a3.cl index e8fd1e79f..20edd68ef 100644 --- a/OpenCL/m02410_a3.cl +++ b/OpenCL/m02410_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -61,7 +61,7 @@ static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); w[0] |= salt_buf0[0]; w[1] |= salt_buf0[1]; @@ -72,7 +72,7 @@ static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k const u32 pw_salt_len = pw_len + salt_len; - truncate_block (w, pw_salt_len); + truncate_block_S (w, pw_salt_len); /** * algorithm specific @@ -159,18 +159,18 @@ static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00); MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01); @@ -245,16 +245,11 @@ static void m02410m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c &= 0x00ffffff; b &= 0x00ffffff; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } -static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -295,7 +290,7 @@ static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k salt_buf3[2] = 0; salt_buf3[3] = 0; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); w[0] |= salt_buf0[0]; w[1] |= salt_buf0[1]; @@ -306,7 +301,7 @@ static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k const u32 pw_salt_len = pw_len + salt_len; - truncate_block (w, pw_salt_len); + truncate_block_S (w, pw_salt_len); /** * algorithm specific @@ -405,18 +400,18 @@ static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00); MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01); @@ -483,9 +478,7 @@ static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD5_STEP0(MD5_I , b, c, d, a, I_wdc3b, MD5S33); MD5_STEP0(MD5_I , a, b, c, d, I_w4c3c, MD5S30); - bool q_cond = allx ((a & 0x00ffffff) != search[0]); - - if (q_cond) continue; + if (MATCHES_NONE_VS ((a & 0x00ffffff), search[0])) continue; MD5_STEP0(MD5_I , d, a, b, c, I_wbc3d, MD5S31); MD5_STEP0(MD5_I , c, d, a, b, I_w2c3e, MD5S32); @@ -496,16 +489,11 @@ static void m02410s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c &= 0x00ffffff; b &= 0x00ffffff; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } -__kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -543,15 +531,15 @@ __kernel void m02410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m02410m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m02410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02410_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m02410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02410_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -589,10 +577,10 @@ __kernel void m02410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m02410s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m02410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02410_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m02410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m02410_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } diff --git a/OpenCL/m02610_a1.cl b/OpenCL/m02610_a1.cl index 29afcbedf..ade14277c 100644 --- a/OpenCL/m02610_a1.cl +++ b/OpenCL/m02610_a1.cl @@ -70,7 +70,7 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -149,7 +149,7 @@ __kernel void m02610_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -422,7 +422,7 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -513,7 +513,7 @@ __kernel void m02610_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m02610_a3.cl b/OpenCL/m02610_a3.cl index dcbf8fa67..07a1f11ef 100644 --- a/OpenCL/m02610_a3.cl +++ b/OpenCL/m02610_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m02610m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -54,195 +62,219 @@ static void m02610m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - const u32 w8_t = s[0]; - const u32 w9_t = s[1]; - const u32 wa_t = s[2]; - const u32 wb_t = s[3]; - const u32 wc_t = s[4]; - const u32 wd_t = s[5]; - const u32 we_t = s[6]; - const u32 wf_t = s[7]; + w2_t[0] = s[0]; + w2_t[1] = s[1]; + w2_t[2] = s[2]; + w2_t[3] = s[3]; + + w3_t[0] = s[4]; + w3_t[1] = s[5]; + w3_t[2] = s[6]; + w3_t[3] = s[7]; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -290,195 +322,219 @@ static void m02610s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - const u32 w8_t = s[0]; - const u32 w9_t = s[1]; - const u32 wa_t = s[2]; - const u32 wb_t = s[3]; - const u32 wc_t = s[4]; - const u32 wd_t = s[5]; - const u32 we_t = s[6]; - const u32 wf_t = s[7]; + w2_t[0] = s[0]; + w2_t[1] = s[1]; + w2_t[2] = s[2]; + w2_t[3] = s[3]; + + w3_t[0] = s[4]; + w3_t[1] = s[5]; + w3_t[2] = s[6]; + w3_t[3] = s[7]; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m02710_a1.cl b/OpenCL/m02710_a1.cl index d7840fb98..e65a088e8 100644 --- a/OpenCL/m02710_a1.cl +++ b/OpenCL/m02710_a1.cl @@ -70,7 +70,7 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -151,7 +151,7 @@ __kernel void m02710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -507,7 +507,7 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -600,7 +600,7 @@ __kernel void m02710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m02710_a3.cl b/OpenCL/m02710_a3.cl index 2fa532aa5..52fee2ad4 100644 --- a/OpenCL/m02710_a3.cl +++ b/OpenCL/m02710_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m02710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -56,193 +64,222 @@ static void m02710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - const u32 w8_t = s[0]; - const u32 w9_t = s[1]; - const u32 wa_t = s[2]; - const u32 wb_t = s[3]; - const u32 wc_t = s[4]; - const u32 wd_t = s[5]; - const u32 we_t = s[6]; - const u32 wf_t = s[7]; + w2_t[0] = s[0]; + w2_t[1] = s[1]; + w2_t[2] = s[2]; + w2_t[3] = s[3]; + + w3_t[0] = s[4]; + w3_t[1] = s[5]; + w3_t[2] = s[6]; + w3_t[3] = s[7]; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r_a = a + MD5M_A; - const u32 r_b = b + MD5M_B; - const u32 r_c = c + MD5M_C; - const u32 r_d = d + MD5M_D; + const u32x r_a = a + MD5M_A; + const u32x r_b = b + MD5M_B; + const u32x r_c = c + MD5M_C; + const u32x r_d = d + MD5M_D; a = r_a; b = r_b; @@ -322,12 +359,7 @@ static void m02710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -377,193 +409,222 @@ static void m02710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w0_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w0_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w1_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w1_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w1_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w1_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; - const u32 w8_t = s[0]; - const u32 w9_t = s[1]; - const u32 wa_t = s[2]; - const u32 wb_t = s[3]; - const u32 wc_t = s[4]; - const u32 wd_t = s[5]; - const u32 we_t = s[6]; - const u32 wf_t = s[7]; + w2_t[0] = s[0]; + w2_t[1] = s[1]; + w2_t[2] = s[2]; + w2_t[3] = s[3]; + + w3_t[0] = s[4]; + w3_t[1] = s[5]; + w3_t[2] = s[6]; + w3_t[3] = s[7]; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r_a = a + MD5M_A; - const u32 r_b = b + MD5M_B; - const u32 r_c = c + MD5M_C; - const u32 r_d = d + MD5M_D; + const u32x r_a = a + MD5M_A; + const u32x r_b = b + MD5M_B; + const u32x r_c = c + MD5M_C; + const u32x r_d = d + MD5M_D; a = r_a; b = r_b; @@ -635,7 +696,7 @@ static void m02710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP0(MD5_I , b, c, d, a, MD5C3b, MD5S33); MD5_STEP0(MD5_I , a, b, c, d, MD5C3c, MD5S30); - if (allx ((a + r_a) != search[0])) continue; + if (MATCHES_NONE_VS ((a + r_a), search[0])) continue; MD5_STEP0(MD5_I , d, a, b, c, MD5C3d, MD5S31); MD5_STEP0(MD5_I , c, d, a, b, MD5C3e, MD5S32); @@ -646,12 +707,7 @@ static void m02710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m02810_a1.cl b/OpenCL/m02810_a1.cl index 52dbf7bd1..4e2e749ea 100644 --- a/OpenCL/m02810_a1.cl +++ b/OpenCL/m02810_a1.cl @@ -70,7 +70,7 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -150,7 +150,7 @@ __kernel void m02810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -506,7 +506,7 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -598,7 +598,7 @@ __kernel void m02810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m02810_a3.cl b/OpenCL/m02810_a3.cl index 6aac33200..7a3426070 100644 --- a/OpenCL/m02810_a3.cl +++ b/OpenCL/m02810_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m02810m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -55,193 +63,222 @@ static void m02810m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = s[0]; - const u32 w1_t = s[1]; - const u32 w2_t = s[2]; - const u32 w3_t = s[3]; - const u32 w4_t = s[4]; - const u32 w5_t = s[5]; - const u32 w6_t = s[6]; - const u32 w7_t = s[7]; + w0_t[0] = s[0]; + w0_t[1] = s[1]; + w0_t[2] = s[2]; + w0_t[3] = s[3]; - const u32 w8_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w9_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 wa_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 wb_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 wc_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 wd_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 we_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 wf_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w1_t[0] = s[4]; + w1_t[1] = s[5]; + w1_t[2] = s[6]; + w1_t[3] = s[7]; + + w2_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w2_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w2_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w2_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w3_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w3_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w3_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w3_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r_a = a + MD5M_A; - const u32 r_b = b + MD5M_B; - const u32 r_c = c + MD5M_C; - const u32 r_d = d + MD5M_D; + const u32x r_a = a + MD5M_A; + const u32x r_b = b + MD5M_B; + const u32x r_c = c + MD5M_C; + const u32x r_d = d + MD5M_D; a = r_a; b = r_b; @@ -321,12 +358,7 @@ static void m02810m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -375,193 +407,222 @@ static void m02810s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = s[0]; - const u32 w1_t = s[1]; - const u32 w2_t = s[2]; - const u32 w3_t = s[3]; - const u32 w4_t = s[4]; - const u32 w5_t = s[5]; - const u32 w6_t = s[6]; - const u32 w7_t = s[7]; + w0_t[0] = s[0]; + w0_t[1] = s[1]; + w0_t[2] = s[2]; + w0_t[3] = s[3]; - const u32 w8_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w9_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 wa_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 wb_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 wc_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 wd_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 we_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 wf_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w1_t[0] = s[4]; + w1_t[1] = s[5]; + w1_t[2] = s[6]; + w1_t[3] = s[7]; + + w2_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 + | uint_to_hex_lower8 ((a >> 8) & 255) << 16; + w2_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 + | uint_to_hex_lower8 ((a >> 24) & 255) << 16; + w2_t[2] = uint_to_hex_lower8 ((b >> 0) & 255) << 0 + | uint_to_hex_lower8 ((b >> 8) & 255) << 16; + w2_t[3] = uint_to_hex_lower8 ((b >> 16) & 255) << 0 + | uint_to_hex_lower8 ((b >> 24) & 255) << 16; + w3_t[0] = uint_to_hex_lower8 ((c >> 0) & 255) << 0 + | uint_to_hex_lower8 ((c >> 8) & 255) << 16; + w3_t[1] = uint_to_hex_lower8 ((c >> 16) & 255) << 0 + | uint_to_hex_lower8 ((c >> 24) & 255) << 16; + w3_t[2] = uint_to_hex_lower8 ((d >> 0) & 255) << 0 + | uint_to_hex_lower8 ((d >> 8) & 255) << 16; + w3_t[3] = uint_to_hex_lower8 ((d >> 16) & 255) << 0 + | uint_to_hex_lower8 ((d >> 24) & 255) << 16; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r_a = a + MD5M_A; - const u32 r_b = b + MD5M_B; - const u32 r_c = c + MD5M_C; - const u32 r_d = d + MD5M_D; + const u32x r_a = a + MD5M_A; + const u32x r_b = b + MD5M_B; + const u32x r_c = c + MD5M_C; + const u32x r_d = d + MD5M_D; a = r_a; b = r_b; @@ -633,7 +694,7 @@ static void m02810s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP0(MD5_I , b, c, d, a, MD5C3b, MD5S33); MD5_STEP0(MD5_I , a, b, c, d, MD5C3c, MD5S30); - if (allx ((a + r_a) != search[0])) continue; + if (MATCHES_NONE_VS ((a + r_a), search[0])) continue; MD5_STEP0(MD5_I , d, a, b, c, MD5C3d, MD5S31); MD5_STEP0(MD5_I , c, d, a, b, MD5C3e, MD5S32); @@ -644,12 +705,7 @@ static void m02810s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m03000_a1.cl b/OpenCL/m03000_a1.cl index 817c7791a..39060ce96 100644 --- a/OpenCL/m03000_a1.cl +++ b/OpenCL/m03000_a1.cl @@ -515,7 +515,7 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -592,7 +592,7 @@ __kernel void m03000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -707,7 +707,7 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -796,7 +796,7 @@ __kernel void m03000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m03100_a0.cl b/OpenCL/m03100_a0.cl index 230b45f72..6b1a2f916 100644 --- a/OpenCL/m03100_a0.cl +++ b/OpenCL/m03100_a0.cl @@ -632,7 +632,7 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t * rules_bu w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -934,7 +934,7 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t * rules_bu w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m03100_a1.cl b/OpenCL/m03100_a1.cl index cbcdd3e12..4ac924bce 100644 --- a/OpenCL/m03100_a1.cl +++ b/OpenCL/m03100_a1.cl @@ -556,7 +556,7 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -621,7 +621,7 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -678,7 +678,7 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -894,7 +894,7 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -971,7 +971,7 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -1028,7 +1028,7 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; diff --git a/OpenCL/m03100_a3.cl b/OpenCL/m03100_a3.cl index 1689cbb7c..8e39347f9 100644 --- a/OpenCL/m03100_a3.cl +++ b/OpenCL/m03100_a3.cl @@ -1,10 +1,12 @@ -/** +/** / s_skb * Author......: Jens Steube * License.....: MIT */ #define _DES_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -354,14 +354,22 @@ __constant u32 c_skb[8][64] = } }; +#if VECT_SIZE == 1 #define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#endif -static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 s_SPtrans[8][64]) +static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 s_SPtrans[8][64]) { - u32 tt; + u32x tt; - u32 r = data[0]; - u32 l = data[1]; + u32x r = data[0]; + u32x l = data[1]; IP (r, l, tt); @@ -371,8 +379,8 @@ static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], #pragma unroll 16 for (u32 i = 0; i < 16; i += 2) { - u32 u; - u32 t; + u32x u; + u32x t; u = Kc[i + 0] ^ r; t = Kd[i + 0] ^ rotl32 (r, 28u); @@ -408,9 +416,9 @@ static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], iv[1] = r; } -static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 s_skb[8][64]) +static void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 s_skb[8][64]) { - u32 tt; + u32x tt; PERM_OP (d, c, tt, 4, 0x0f0f0f0f); HPERM_OP (c, tt, 2, 0xcccc0000); @@ -443,32 +451,32 @@ static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u c = c & 0x0fffffff; d = d & 0x0fffffff; - const u32 c00 = (c >> 0) & 0x0000003f; - const u32 c06 = (c >> 6) & 0x00383003; - const u32 c07 = (c >> 7) & 0x0000003c; - const u32 c13 = (c >> 13) & 0x0000060f; - const u32 c20 = (c >> 20) & 0x00000001; + const u32x c00 = (c >> 0) & 0x0000003f; + const u32x c06 = (c >> 6) & 0x00383003; + const u32x c07 = (c >> 7) & 0x0000003c; + const u32x c13 = (c >> 13) & 0x0000060f; + const u32x c20 = (c >> 20) & 0x00000001; - u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb) - | BOX (((c06 >> 0) & 0xff) - |((c07 >> 0) & 0xff), 1, s_skb) - | BOX (((c13 >> 0) & 0xff) - |((c06 >> 8) & 0xff), 2, s_skb) - | BOX (((c20 >> 0) & 0xff) - |((c13 >> 8) & 0xff) - |((c06 >> 16) & 0xff), 3, s_skb); + u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); - const u32 d00 = (d >> 0) & 0x00003c3f; - const u32 d07 = (d >> 7) & 0x00003f03; - const u32 d21 = (d >> 21) & 0x0000000f; - const u32 d22 = (d >> 22) & 0x00000030; + const u32x d00 = (d >> 0) & 0x00003c3f; + const u32x d07 = (d >> 7) & 0x00003f03; + const u32x d21 = (d >> 21) & 0x0000000f; + const u32x d22 = (d >> 22) & 0x00000030; - u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb) - | BOX (((d07 >> 0) & 0xff) - |((d00 >> 8) & 0xff), 5, s_skb) - | BOX (((d07 >> 8) & 0xff), 6, s_skb) - | BOX (((d21 >> 0) & 0xff) - |((d22 >> 0) & 0xff), 7, s_skb); + u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); Kc[i] = ((t << 16) | (s & 0x0000ffff)); Kd[i] = ((s >> 16) | (t & 0xffff0000)); @@ -478,196 +486,7 @@ static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u } } -static void overwrite_at (u32 sw[16], const u32 w0, const u32 salt_len) -{ - #if defined cl_amd_media_ops - switch (salt_len) - { - case 0: sw[0] = w0; - break; - case 1: sw[0] = amd_bytealign (w0, sw[0] << 24, 3); - sw[1] = amd_bytealign (sw[1] >> 8, w0, 3); - break; - case 2: sw[0] = amd_bytealign (w0, sw[0] << 16, 2); - sw[1] = amd_bytealign (sw[1] >> 16, w0, 2); - break; - case 3: sw[0] = amd_bytealign (w0, sw[0] << 8, 1); - sw[1] = amd_bytealign (sw[1] >> 24, w0, 1); - break; - case 4: sw[1] = w0; - break; - case 5: sw[1] = amd_bytealign (w0, sw[1] << 24, 3); - sw[2] = amd_bytealign (sw[2] >> 8, w0, 3); - break; - case 6: sw[1] = amd_bytealign (w0, sw[1] << 16, 2); - sw[2] = amd_bytealign (sw[2] >> 16, w0, 2); - break; - case 7: sw[1] = amd_bytealign (w0, sw[1] << 8, 1); - sw[2] = amd_bytealign (sw[2] >> 24, w0, 1); - break; - case 8: sw[2] = w0; - break; - case 9: sw[2] = amd_bytealign (w0, sw[2] << 24, 3); - sw[3] = amd_bytealign (sw[3] >> 8, w0, 3); - break; - case 10: sw[2] = amd_bytealign (w0, sw[2] << 16, 2); - sw[3] = amd_bytealign (sw[3] >> 16, w0, 2); - break; - case 11: sw[2] = amd_bytealign (w0, sw[2] << 8, 1); - sw[3] = amd_bytealign (sw[3] >> 24, w0, 1); - break; - case 12: sw[3] = w0; - break; - case 13: sw[3] = amd_bytealign (w0, sw[3] << 24, 3); - sw[4] = amd_bytealign (sw[4] >> 8, w0, 3); - break; - case 14: sw[3] = amd_bytealign (w0, sw[3] << 16, 2); - sw[4] = amd_bytealign (sw[4] >> 16, w0, 2); - break; - case 15: sw[3] = amd_bytealign (w0, sw[3] << 8, 1); - sw[4] = amd_bytealign (sw[4] >> 24, w0, 1); - break; - case 16: sw[4] = w0; - break; - case 17: sw[4] = amd_bytealign (w0, sw[4] << 24, 3); - sw[5] = amd_bytealign (sw[5] >> 8, w0, 3); - break; - case 18: sw[4] = amd_bytealign (w0, sw[4] << 16, 2); - sw[5] = amd_bytealign (sw[5] >> 16, w0, 2); - break; - case 19: sw[4] = amd_bytealign (w0, sw[4] << 8, 1); - sw[5] = amd_bytealign (sw[5] >> 24, w0, 1); - break; - case 20: sw[5] = w0; - break; - case 21: sw[5] = amd_bytealign (w0, sw[5] << 24, 3); - sw[6] = amd_bytealign (sw[6] >> 8, w0, 3); - break; - case 22: sw[5] = amd_bytealign (w0, sw[5] << 16, 2); - sw[6] = amd_bytealign (sw[6] >> 16, w0, 2); - break; - case 23: sw[5] = amd_bytealign (w0, sw[5] << 8, 1); - sw[6] = amd_bytealign (sw[6] >> 24, w0, 1); - break; - case 24: sw[6] = w0; - break; - case 25: sw[6] = amd_bytealign (w0, sw[6] << 24, 3); - sw[7] = amd_bytealign (sw[7] >> 8, w0, 3); - break; - case 26: sw[6] = amd_bytealign (w0, sw[6] << 16, 2); - sw[7] = amd_bytealign (sw[7] >> 16, w0, 2); - break; - case 27: sw[6] = amd_bytealign (w0, sw[6] << 8, 1); - sw[7] = amd_bytealign (sw[7] >> 24, w0, 1); - break; - case 28: sw[7] = w0; - break; - case 29: sw[7] = amd_bytealign (w0, sw[7] << 24, 3); - sw[8] = amd_bytealign (sw[8] >> 8, w0, 3); - break; - case 30: sw[7] = amd_bytealign (w0, sw[7] << 16, 2); - sw[8] = amd_bytealign (sw[8] >> 16, w0, 2); - break; - case 31: sw[7] = amd_bytealign (w0, sw[7] << 8, 1); - sw[8] = amd_bytealign (sw[8] >> 24, w0, 1); - break; - } - #else - switch (salt_len) - { - case 0: sw[0] = w0; - break; - case 1: sw[0] = (sw[0] & 0x000000ff) | (w0 << 8); - sw[1] = (sw[1] & 0xffffff00) | (w0 >> 24); - break; - case 2: sw[0] = (sw[0] & 0x0000ffff) | (w0 << 16); - sw[1] = (sw[1] & 0xffff0000) | (w0 >> 16); - break; - case 3: sw[0] = (sw[0] & 0x00ffffff) | (w0 << 24); - sw[1] = (sw[1] & 0xff000000) | (w0 >> 8); - break; - case 4: sw[1] = w0; - break; - case 5: sw[1] = (sw[1] & 0x000000ff) | (w0 << 8); - sw[2] = (sw[2] & 0xffffff00) | (w0 >> 24); - break; - case 6: sw[1] = (sw[1] & 0x0000ffff) | (w0 << 16); - sw[2] = (sw[2] & 0xffff0000) | (w0 >> 16); - break; - case 7: sw[1] = (sw[1] & 0x00ffffff) | (w0 << 24); - sw[2] = (sw[2] & 0xff000000) | (w0 >> 8); - break; - case 8: sw[2] = w0; - break; - case 9: sw[2] = (sw[2] & 0x000000ff) | (w0 << 8); - sw[3] = (sw[3] & 0xffffff00) | (w0 >> 24); - break; - case 10: sw[2] = (sw[2] & 0x0000ffff) | (w0 << 16); - sw[3] = (sw[3] & 0xffff0000) | (w0 >> 16); - break; - case 11: sw[2] = (sw[2] & 0x00ffffff) | (w0 << 24); - sw[3] = (sw[3] & 0xff000000) | (w0 >> 8); - break; - case 12: sw[3] = w0; - break; - case 13: sw[3] = (sw[3] & 0x000000ff) | (w0 << 8); - sw[4] = (sw[4] & 0xffffff00) | (w0 >> 24); - break; - case 14: sw[3] = (sw[3] & 0x0000ffff) | (w0 << 16); - sw[4] = (sw[4] & 0xffff0000) | (w0 >> 16); - break; - case 15: sw[3] = (sw[3] & 0x00ffffff) | (w0 << 24); - sw[4] = (sw[4] & 0xff000000) | (w0 >> 8); - break; - case 16: sw[4] = w0; - break; - case 17: sw[4] = (sw[4] & 0x000000ff) | (w0 << 8); - sw[5] = (sw[5] & 0xffffff00) | (w0 >> 24); - break; - case 18: sw[4] = (sw[4] & 0x0000ffff) | (w0 << 16); - sw[5] = (sw[5] & 0xffff0000) | (w0 >> 16); - break; - case 19: sw[4] = (sw[4] & 0x00ffffff) | (w0 << 24); - sw[5] = (sw[5] & 0xff000000) | (w0 >> 8); - break; - case 20: sw[5] = w0; - break; - case 21: sw[5] = (sw[5] & 0x000000ff) | (w0 << 8); - sw[6] = (sw[6] & 0xffffff00) | (w0 >> 24); - break; - case 22: sw[5] = (sw[5] & 0x0000ffff) | (w0 << 16); - sw[6] = (sw[6] & 0xffff0000) | (w0 >> 16); - break; - case 23: sw[5] = (sw[5] & 0x00ffffff) | (w0 << 24); - sw[6] = (sw[6] & 0xff000000) | (w0 >> 8); - break; - case 24: sw[6] = w0; - break; - case 25: sw[6] = (sw[6] & 0x000000ff) | (w0 << 8); - sw[7] = (sw[7] & 0xffffff00) | (w0 >> 24); - break; - case 26: sw[6] = (sw[6] & 0x0000ffff) | (w0 << 16); - sw[7] = (sw[7] & 0xffff0000) | (w0 >> 16); - break; - case 27: sw[6] = (sw[6] & 0x00ffffff) | (w0 << 24); - sw[7] = (sw[7] & 0xff000000) | (w0 >> 8); - break; - case 28: sw[7] = w0; - break; - case 29: sw[7] = (sw[7] & 0x000000ff) | (w0 << 8); - sw[8] = (sw[8] & 0xffffff00) | (w0 >> 24); - break; - case 30: sw[7] = (sw[7] & 0x0000ffff) | (w0 << 16); - sw[8] = (sw[8] & 0xffff0000) | (w0 >> 16); - break; - case 31: sw[7] = (sw[7] & 0x00ffffff) | (w0 << 24); - sw[8] = (sw[8] & 0xff000000) | (w0 >> 8); - break; - } - #endif -} - -static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -731,7 +550,7 @@ static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w3_t[2] = w[14]; w3_t[3] = w[15]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -750,7 +569,7 @@ static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w3_t[2] = 0; w3_t[3] = 0; - u32 dst[16]; + u32x dst[16]; dst[ 0] = w0_t[0]; dst[ 1] = w0_t[1]; @@ -775,20 +594,20 @@ static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - overwrite_at (dst, w0, salt_len); + overwrite_at_le (dst, w0, salt_len); /** * precompute key1 since key is static: 0x0123456789abcdef * plus LEFT_ROTATE by 2 */ - u32 Kc[16]; + u32x Kc[16]; Kc[ 0] = 0x64649040; Kc[ 1] = 0x14909858; @@ -807,7 +626,7 @@ static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 Kc[14] = 0x584020b4; Kc[15] = 0x00742c4c; - u32 Kd[16]; + u32x Kd[16]; Kd[ 0] = 0xa42ce40c; Kd[ 1] = 0x64689858; @@ -830,14 +649,14 @@ static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * key1 (generate key) */ - u32 iv[2]; + u32x iv[2]; iv[0] = 0; iv[1] = 0; for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++) { - u32 data[2]; + u32x data[2]; data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00); data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00); @@ -859,7 +678,7 @@ static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++) { - u32 data[2]; + u32x data[2]; data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00); data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00); @@ -874,16 +693,14 @@ static void m03100m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * cmp */ - const u32 r0 = iv[0]; - const u32 r1 = iv[1]; - const u32 r2 = 0; - const u32 r3 = 0; + u32x c = 0; + u32x d = 0; - #include COMPARE_M + COMPARE_M_SIMD (iv[0], iv[1], c, d); } } -static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -947,7 +764,7 @@ static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w3_t[2] = w[14]; w3_t[3] = w[15]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -966,7 +783,7 @@ static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w3_t[2] = 0; w3_t[3] = 0; - u32 dst[16]; + u32x dst[16]; dst[ 0] = w0_t[0]; dst[ 1] = w0_t[1]; @@ -1003,20 +820,20 @@ static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - overwrite_at (dst, w0, salt_len); + overwrite_at_le (dst, w0, salt_len); /** * precompute key1 since key is static: 0x0123456789abcdef * plus LEFT_ROTATE by 2 */ - u32 Kc[16]; + u32x Kc[16]; Kc[ 0] = 0x64649040; Kc[ 1] = 0x14909858; @@ -1035,7 +852,7 @@ static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 Kc[14] = 0x584020b4; Kc[15] = 0x00742c4c; - u32 Kd[16]; + u32x Kd[16]; Kd[ 0] = 0xa42ce40c; Kd[ 1] = 0x64689858; @@ -1058,14 +875,14 @@ static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * key1 (generate key) */ - u32 iv[2]; + u32x iv[2]; iv[0] = 0; iv[1] = 0; for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++) { - u32 data[2]; + u32x data[2]; data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00); data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00); @@ -1087,7 +904,7 @@ static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 for (u32 j = 0, k = 0; j < salt_word_len; j += 8, k++) { - u32 data[2]; + u32x data[2]; data[0] = ((dst[k] << 16) & 0xff000000) | ((dst[k] << 8) & 0x0000ff00); data[1] = ((dst[k] >> 0) & 0xff000000) | ((dst[k] >> 8) & 0x0000ff00); @@ -1102,16 +919,14 @@ static void m03100s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * cmp */ - const u32 r0 = iv[0]; - const u32 r1 = iv[1]; - const u32 r2 = 0; - const u32 r3 = 0; + u32x c = 0; + u32x d = 0; - #include COMPARE_S + COMPARE_S_SIMD (iv[0], iv[1], c, d); } } -__kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -1185,7 +1000,7 @@ __kernel void m03100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m03100m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m03100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -1259,11 +1074,11 @@ __kernel void m03100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m03100m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m03100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -1337,7 +1152,7 @@ __kernel void m03100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m03100s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m03100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -1411,6 +1226,6 @@ __kernel void m03100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m03100s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m03100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m03100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } diff --git a/OpenCL/m03710_a0.cl b/OpenCL/m03710_a0.cl index 9270359bf..ad980bfa1 100644 --- a/OpenCL/m03710_a0.cl +++ b/OpenCL/m03710_a0.cl @@ -263,7 +263,7 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w3_t[2] = pw_salt_len * 8; @@ -629,7 +629,7 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m03710_a1.cl b/OpenCL/m03710_a1.cl index 35d8263fd..d9380daa4 100644 --- a/OpenCL/m03710_a1.cl +++ b/OpenCL/m03710_a1.cl @@ -89,7 +89,7 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -168,7 +168,7 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -313,7 +313,7 @@ __kernel void m03710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w3_t[2] = pw_salt_len * 8; @@ -495,7 +495,7 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -586,7 +586,7 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -731,7 +731,7 @@ __kernel void m03710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w3_t[2] = pw_salt_len * 8; diff --git a/OpenCL/m03710_a3.cl b/OpenCL/m03710_a3.cl index 113bed720..bf47cc4ef 100644 --- a/OpenCL/m03710_a3.cl +++ b/OpenCL/m03710_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m03710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -73,95 +81,118 @@ static void m03710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 | uint_to_hex_lower8 ((a >> 8) & 255) << 16; w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 @@ -193,7 +224,7 @@ static void m03710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w3_t[2] = pw_salt_len * 8; @@ -291,12 +322,7 @@ static void m03710m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -363,95 +389,118 @@ static void m03710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; - w0_t[0] = uint_to_hex_lower8 ((a >> 0) & 255) << 0 | uint_to_hex_lower8 ((a >> 8) & 255) << 16; w0_t[1] = uint_to_hex_lower8 ((a >> 16) & 255) << 0 @@ -483,7 +532,7 @@ static void m03710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w3_t[2] = pw_salt_len * 8; @@ -581,12 +630,7 @@ static void m03710s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m03800_a0.cl b/OpenCL/m03800_a0.cl index 199f761e1..0845c4ab6 100644 --- a/OpenCL/m03800_a0.cl +++ b/OpenCL/m03800_a0.cl @@ -155,7 +155,7 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -206,7 +206,7 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -471,7 +471,7 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -522,7 +522,7 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; diff --git a/OpenCL/m03800_a1.cl b/OpenCL/m03800_a1.cl index 1b4b05c54..fe2fdf64d 100644 --- a/OpenCL/m03800_a1.cl +++ b/OpenCL/m03800_a1.cl @@ -66,7 +66,7 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -143,7 +143,7 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -206,7 +206,7 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -257,7 +257,7 @@ __kernel void m03800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -423,7 +423,7 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -512,7 +512,7 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -575,7 +575,7 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -626,7 +626,7 @@ __kernel void m03800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; diff --git a/OpenCL/m03800_a3.cl b/OpenCL/m03800_a3.cl index c784f878c..502295c96 100644 --- a/OpenCL/m03800_a3.cl +++ b/OpenCL/m03800_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -75,34 +75,34 @@ static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -114,7 +114,7 @@ static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -137,35 +137,35 @@ static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * append salt */ - u32 s0[4]; + u32x s0[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = salt_buf0[3]; - u32 s1[4]; + u32x s1[4]; s1[0] = salt_buf1[0]; s1[1] = salt_buf1[1]; s1[2] = salt_buf1[2]; s1[3] = salt_buf1[3]; - u32 s2[4]; + u32x s2[4]; s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - u32 s3[4]; + u32x s3[4]; s3[0] = 0; s3[1] = 0; s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -192,10 +192,10 @@ static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -266,12 +266,7 @@ static void m03800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -338,34 +333,34 @@ static void m03800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -377,7 +372,7 @@ static void m03800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -400,35 +395,35 @@ static void m03800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * append salt */ - u32 s0[4]; + u32x s0[4]; s0[0] = salt_buf0[0]; s0[1] = salt_buf0[1]; s0[2] = salt_buf0[2]; s0[3] = salt_buf0[3]; - u32 s1[4]; + u32x s1[4]; s1[0] = salt_buf1[0]; s1[1] = salt_buf1[1]; s1[2] = salt_buf1[2]; s1[3] = salt_buf1[3]; - u32 s2[4]; + u32x s2[4]; s2[0] = 0; s2[1] = 0; s2[2] = 0; s2[3] = 0; - u32 s3[4]; + u32x s3[4]; s3[0] = 0; s3[1] = 0; s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -455,10 +450,10 @@ static void m03800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -528,12 +523,7 @@ static void m03800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m04310_a1.cl b/OpenCL/m04310_a1.cl index 1728cd5a3..08bf8143c 100644 --- a/OpenCL/m04310_a1.cl +++ b/OpenCL/m04310_a1.cl @@ -89,7 +89,7 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -149,7 +149,7 @@ __kernel void m04310_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -441,7 +441,7 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -513,7 +513,7 @@ __kernel void m04310_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m04310_a3.cl b/OpenCL/m04310_a3.cl index 1d350be1c..4de070349 100644 --- a/OpenCL/m04310_a3.cl +++ b/OpenCL/m04310_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m04310m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -54,195 +62,219 @@ static void m04310m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w0_t[0] = uint_to_hex_upper8 ((a >> 0) & 255) << 0 + | uint_to_hex_upper8 ((a >> 8) & 255) << 16; + w0_t[1] = uint_to_hex_upper8 ((a >> 16) & 255) << 0 + | uint_to_hex_upper8 ((a >> 24) & 255) << 16; + w0_t[2] = uint_to_hex_upper8 ((b >> 0) & 255) << 0 + | uint_to_hex_upper8 ((b >> 8) & 255) << 16; + w0_t[3] = uint_to_hex_upper8 ((b >> 16) & 255) << 0 + | uint_to_hex_upper8 ((b >> 24) & 255) << 16; + w1_t[0] = uint_to_hex_upper8 ((c >> 0) & 255) << 0 + | uint_to_hex_upper8 ((c >> 8) & 255) << 16; + w1_t[1] = uint_to_hex_upper8 ((c >> 16) & 255) << 0 + | uint_to_hex_upper8 ((c >> 24) & 255) << 16; + w1_t[2] = uint_to_hex_upper8 ((d >> 0) & 255) << 0 + | uint_to_hex_upper8 ((d >> 8) & 255) << 16; + w1_t[3] = uint_to_hex_upper8 ((d >> 16) & 255) << 0 + | uint_to_hex_upper8 ((d >> 24) & 255) << 16; - const u32 w8_t = s[0]; - const u32 w9_t = s[1]; - const u32 wa_t = s[2]; - const u32 wb_t = s[3]; - const u32 wc_t = s[4]; - const u32 wd_t = s[5]; - const u32 we_t = s[6]; - const u32 wf_t = s[7]; + w2_t[0] = s[0]; + w2_t[1] = s[1]; + w2_t[2] = s[2]; + w2_t[3] = s[3]; + + w3_t[0] = s[4]; + w3_t[1] = s[5]; + w3_t[2] = s[6]; + w3_t[3] = s[7]; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -290,195 +322,219 @@ static void m04310s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x w0_t[4]; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + u32x w1_t[4]; - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; + + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); + + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); + + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); + + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; c += MD5M_C; d += MD5M_D; - const u32 w0_t = uint_to_hex_lower8 ((a >> 0) & 255) << 0 - | uint_to_hex_lower8 ((a >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_lower8 ((a >> 16) & 255) << 0 - | uint_to_hex_lower8 ((a >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_lower8 ((b >> 0) & 255) << 0 - | uint_to_hex_lower8 ((b >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_lower8 ((b >> 16) & 255) << 0 - | uint_to_hex_lower8 ((b >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_lower8 ((c >> 0) & 255) << 0 - | uint_to_hex_lower8 ((c >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_lower8 ((c >> 16) & 255) << 0 - | uint_to_hex_lower8 ((c >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_lower8 ((d >> 0) & 255) << 0 - | uint_to_hex_lower8 ((d >> 8) & 255) << 16; - const u32 w7_t = uint_to_hex_lower8 ((d >> 16) & 255) << 0 - | uint_to_hex_lower8 ((d >> 24) & 255) << 16; + w0_t[0] = uint_to_hex_upper8 ((a >> 0) & 255) << 0 + | uint_to_hex_upper8 ((a >> 8) & 255) << 16; + w0_t[1] = uint_to_hex_upper8 ((a >> 16) & 255) << 0 + | uint_to_hex_upper8 ((a >> 24) & 255) << 16; + w0_t[2] = uint_to_hex_upper8 ((b >> 0) & 255) << 0 + | uint_to_hex_upper8 ((b >> 8) & 255) << 16; + w0_t[3] = uint_to_hex_upper8 ((b >> 16) & 255) << 0 + | uint_to_hex_upper8 ((b >> 24) & 255) << 16; + w1_t[0] = uint_to_hex_upper8 ((c >> 0) & 255) << 0 + | uint_to_hex_upper8 ((c >> 8) & 255) << 16; + w1_t[1] = uint_to_hex_upper8 ((c >> 16) & 255) << 0 + | uint_to_hex_upper8 ((c >> 24) & 255) << 16; + w1_t[2] = uint_to_hex_upper8 ((d >> 0) & 255) << 0 + | uint_to_hex_upper8 ((d >> 8) & 255) << 16; + w1_t[3] = uint_to_hex_upper8 ((d >> 16) & 255) << 0 + | uint_to_hex_upper8 ((d >> 24) & 255) << 16; - const u32 w8_t = s[0]; - const u32 w9_t = s[1]; - const u32 wa_t = s[2]; - const u32 wb_t = s[3]; - const u32 wc_t = s[4]; - const u32 wd_t = s[5]; - const u32 we_t = s[6]; - const u32 wf_t = s[7]; + w2_t[0] = s[0]; + w2_t[1] = s[1]; + w2_t[2] = s[2]; + w2_t[3] = s[3]; + + w3_t[0] = s[4]; + w3_t[1] = s[5]; + w3_t[2] = s[6]; + w3_t[3] = s[7]; a = MD5M_A; b = MD5M_B; c = MD5M_C; d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2_t, MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3_t, MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w4_t, MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w5_t, MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w6_t, MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w7_t, MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w8_t, MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w9_t, MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, wa_t, MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wb_t, MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, wc_t, MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, wd_t, MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, we_t, MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, wf_t, MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w1_t, MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w6_t, MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wb_t, MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0_t, MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w5_t, MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, wa_t, MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, wf_t, MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w4_t, MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w9_t, MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, we_t, MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3_t, MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w8_t, MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, wd_t, MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2_t, MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w7_t, MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, wc_t, MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w5_t, MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w8_t, MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wb_t, MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, we_t, MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w1_t, MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w4_t, MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w7_t, MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, wa_t, MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, wd_t, MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0_t, MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3_t, MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w6_t, MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w9_t, MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, wc_t, MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, wf_t, MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2_t, MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0_t, MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w7_t, MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, we_t, MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w5_t, MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, wc_t, MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3_t, MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, wa_t, MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1_t, MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w8_t, MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wf_t, MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w6_t, MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, wd_t, MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w4_t, MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, wb_t, MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m04400_a1.cl b/OpenCL/m04400_a1.cl index 39ac476a0..f1826d2cc 100644 --- a/OpenCL/m04400_a1.cl +++ b/OpenCL/m04400_a1.cl @@ -89,7 +89,7 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -134,7 +134,7 @@ __kernel void m04400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -483,7 +483,7 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -540,7 +540,7 @@ __kernel void m04400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m04400_a3.cl b/OpenCL/m04400_a3.cl index 6f05d0d97..aef6da76c 100644 --- a/OpenCL/m04400_a3.cl +++ b/OpenCL/m04400_a3.cl @@ -5,6 +5,8 @@ #define _MD5_SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m04400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -41,38 +49,38 @@ static void m04400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -281,12 +289,7 @@ static void m04400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -317,38 +320,38 @@ static void m04400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -557,12 +560,7 @@ static void m04400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w2_t, MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w9_t, MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m04500_a1.cl b/OpenCL/m04500_a1.cl index dd69d9264..a52603961 100644 --- a/OpenCL/m04500_a1.cl +++ b/OpenCL/m04500_a1.cl @@ -89,7 +89,7 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -134,7 +134,7 @@ __kernel void m04500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -512,7 +512,7 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -575,7 +575,7 @@ __kernel void m04500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m04500_a3.cl b/OpenCL/m04500_a3.cl index a3fa7b69d..28fd960ac 100644 --- a/OpenCL/m04500_a3.cl +++ b/OpenCL/m04500_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8_le(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m04500m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -37,38 +45,38 @@ static void m04500m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -306,12 +314,7 @@ static void m04500m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -340,7 +343,7 @@ static void m04500s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * reverse */ - const u32 e_rev = rotl32 (search[1], 2u); + const u32 e_rev = rotl32_S (search[1], 2u); /** * loop @@ -348,38 +351,38 @@ static void m04500s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -613,19 +616,14 @@ static void m04500s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); - if (allx (e != e_rev)) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } diff --git a/OpenCL/m04700_a1.cl b/OpenCL/m04700_a1.cl index b6a951b77..b67306106 100644 --- a/OpenCL/m04700_a1.cl +++ b/OpenCL/m04700_a1.cl @@ -90,7 +90,7 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -135,7 +135,7 @@ __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -466,7 +466,7 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -529,7 +529,7 @@ __kernel void m04700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m04700_a3.cl b/OpenCL/m04700_a3.cl index 0abaf6971..da2118277 100644 --- a/OpenCL/m04700_a3.cl +++ b/OpenCL/m04700_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -17,11 +19,17 @@ #undef _MD5_ #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8_le(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m04700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -38,88 +46,116 @@ static void m04700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; + + u32x w0_t[4]; + + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + u32x w1_t[4]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; @@ -130,33 +166,34 @@ static void m04700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; - u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; - u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; - u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; - u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; - u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; - u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; - u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + w0_t[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + w0_t[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + w0_t[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; + w0_t[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + w1_t[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + w1_t[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; + w1_t[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; + w1_t[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; - u32 w8_t = 0x80000000; - u32 w9_t = 0; - u32 wa_t = 0; - u32 wb_t = 0; - u32 wc_t = 0; - u32 wd_t = 0; - u32 we_t = 0; - u32 wf_t = 32 * 8; + w2_t[0] = 0x80000000; + w2_t[1] = 0; + w2_t[2] = 0; + w2_t[3] = 0; - u32 e; + w3_t[0] = 0; + w3_t[1] = 0; + w3_t[2] = 0; + w3_t[3] = 32 * 8; + + u32x e; a = SHA1M_A; b = SHA1M_B; @@ -167,105 +204,100 @@ static void m04700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le #undef K #define K SHA1C00 - SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t); - SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t); - SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t); - SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t); - SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t); - SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t); - SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t); - SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t); - SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t); - SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t); - SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t); - SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t); - SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t); - SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t); - SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t); - SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t[0]); + SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[1]); + SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[2]); + SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[3]); + SHA1_STEP (SHA1_F0o, b, c, d, e, a, w1_t[0]); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w1_t[1]); + SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t[2]); + SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t[3]); + SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t[0]); + SHA1_STEP (SHA1_F0o, b, c, d, e, a, w2_t[1]); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w2_t[2]); + SHA1_STEP (SHA1_F0o, e, a, b, c, d, w2_t[3]); + SHA1_STEP (SHA1_F0o, d, e, a, b, c, w3_t[0]); + SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t[1]); + SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t[2]); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w0_t[3]); #undef K #define K SHA1C01 - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[3]); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[3]); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[3]); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[3]); #undef K #define K SHA1C02 - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[3]); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w0_t[3]); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w1_t[3]); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w2_t[3]); #undef K #define K SHA1C03 - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[3]); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[3]); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -294,7 +326,7 @@ static void m04700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * reverse */ - const u32 e_rev = rotl32 (search[1], 2u); + const u32 e_rev = rotl32_S (search[1], 2u); /** * loop @@ -302,88 +334,116 @@ static void m04700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; + + u32x w0_t[4]; + + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + u32x w1_t[4]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + u32x w2_t[4]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + u32x w3_t[4]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; /** * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w1[0], MD5C04, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w1[1], MD5C05, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w1[2], MD5C06, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w1[3], MD5C07, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w2[0], MD5C08, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w2[1], MD5C09, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w2[2], MD5C0a, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w2[3], MD5C0b, MD5S03); - MD5_STEP (MD5_Fo, a, b, c, d, w3[0], MD5C0c, MD5S00); - MD5_STEP (MD5_Fo, d, a, b, c, w3[1], MD5C0d, MD5S01); - MD5_STEP (MD5_Fo, c, d, a, b, w3[2], MD5C0e, MD5S02); - MD5_STEP (MD5_Fo, b, c, d, a, w3[3], MD5C0f, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w0_t[2], MD5C02, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w0_t[3], MD5C03, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w1_t[0], MD5C04, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w1_t[1], MD5C05, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w1_t[2], MD5C06, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w1_t[3], MD5C07, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w2_t[0], MD5C08, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w2_t[1], MD5C09, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w2_t[2], MD5C0a, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w2_t[3], MD5C0b, MD5S03); + MD5_STEP (MD5_Fo, a, b, c, d, w3_t[0], MD5C0c, MD5S00); + MD5_STEP (MD5_Fo, d, a, b, c, w3_t[1], MD5C0d, MD5S01); + MD5_STEP (MD5_Fo, c, d, a, b, w3_t[2], MD5C0e, MD5S02); + MD5_STEP (MD5_Fo, b, c, d, a, w3_t[3], MD5C0f, MD5S03); - MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w1[0], MD5C17, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w2[1], MD5C18, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w3[2], MD5C19, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w0[3], MD5C1a, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w2[0], MD5C1b, MD5S13); - MD5_STEP (MD5_Go, a, b, c, d, w3[1], MD5C1c, MD5S10); - MD5_STEP (MD5_Go, d, a, b, c, w0[2], MD5C1d, MD5S11); - MD5_STEP (MD5_Go, c, d, a, b, w1[3], MD5C1e, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w3[0], MD5C1f, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w0_t[1], MD5C10, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w1_t[2], MD5C11, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w2_t[3], MD5C12, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w0_t[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w1_t[1], MD5C14, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w2_t[2], MD5C15, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w3_t[3], MD5C16, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w1_t[0], MD5C17, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w2_t[1], MD5C18, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w3_t[2], MD5C19, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w0_t[3], MD5C1a, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w2_t[0], MD5C1b, MD5S13); + MD5_STEP (MD5_Go, a, b, c, d, w3_t[1], MD5C1c, MD5S10); + MD5_STEP (MD5_Go, d, a, b, c, w0_t[2], MD5C1d, MD5S11); + MD5_STEP (MD5_Go, c, d, a, b, w1_t[3], MD5C1e, MD5S12); + MD5_STEP (MD5_Go, b, c, d, a, w3_t[0], MD5C1f, MD5S13); - MD5_STEP (MD5_H , a, b, c, d, w1[1], MD5C20, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w2[0], MD5C21, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w2[3], MD5C22, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w3[2], MD5C23, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w0[1], MD5C24, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w1[0], MD5C25, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); - MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w3[0], MD5C2d, MD5S21); - MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); - MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w1_t[1], MD5C20, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w2_t[0], MD5C21, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w2_t[3], MD5C22, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w3_t[2], MD5C23, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w0_t[1], MD5C24, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w1_t[0], MD5C25, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w1_t[3], MD5C26, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w2_t[2], MD5C27, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w3_t[1], MD5C28, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w0_t[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w0_t[3], MD5C2a, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w1_t[2], MD5C2b, MD5S23); + MD5_STEP (MD5_H , a, b, c, d, w2_t[1], MD5C2c, MD5S20); + MD5_STEP (MD5_H , d, a, b, c, w3_t[0], MD5C2d, MD5S21); + MD5_STEP (MD5_H , c, d, a, b, w3_t[3], MD5C2e, MD5S22); + MD5_STEP (MD5_H , b, c, d, a, w0_t[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w3[0], MD5C34, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w0[3], MD5C35, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w2[2], MD5C36, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w0[1], MD5C37, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w2[0], MD5C38, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w3[3], MD5C39, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w1[2], MD5C3a, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w3[1], MD5C3b, MD5S33); - MD5_STEP (MD5_I , a, b, c, d, w1[0], MD5C3c, MD5S30); - MD5_STEP (MD5_I , d, a, b, c, w2[3], MD5C3d, MD5S31); - MD5_STEP (MD5_I , c, d, a, b, w0[2], MD5C3e, MD5S32); - MD5_STEP (MD5_I , b, c, d, a, w2[1], MD5C3f, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w0_t[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w1_t[3], MD5C31, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w3_t[2], MD5C32, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w1_t[1], MD5C33, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w3_t[0], MD5C34, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w0_t[3], MD5C35, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w2_t[2], MD5C36, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w0_t[1], MD5C37, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w2_t[0], MD5C38, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w3_t[3], MD5C39, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w1_t[2], MD5C3a, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); + MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); + MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); + MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); + MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); a += MD5M_A; b += MD5M_B; @@ -394,33 +454,34 @@ static void m04700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 w0_t = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; - u32 w1_t = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; - u32 w2_t = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; - u32 w3_t = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; - u32 w4_t = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; - u32 w5_t = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; - u32 w6_t = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 - | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; - u32 w7_t = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 - | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; + w0_t[0] = uint_to_hex_lower8_le ((a >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 0) & 255) << 16; + w0_t[1] = uint_to_hex_lower8_le ((a >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((a >> 16) & 255) << 16; + w0_t[2] = uint_to_hex_lower8_le ((b >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 0) & 255) << 16; + w0_t[3] = uint_to_hex_lower8_le ((b >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((b >> 16) & 255) << 16; + w1_t[0] = uint_to_hex_lower8_le ((c >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 0) & 255) << 16; + w1_t[1] = uint_to_hex_lower8_le ((c >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((c >> 16) & 255) << 16; + w1_t[2] = uint_to_hex_lower8_le ((d >> 8) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 0) & 255) << 16; + w1_t[3] = uint_to_hex_lower8_le ((d >> 24) & 255) << 0 + | uint_to_hex_lower8_le ((d >> 16) & 255) << 16; - u32 w8_t = 0x80000000; - u32 w9_t = 0; - u32 wa_t = 0; - u32 wb_t = 0; - u32 wc_t = 0; - u32 wd_t = 0; - u32 we_t = 0; - u32 wf_t = 32 * 8; + w2_t[0] = 0x80000000; + w2_t[1] = 0; + w2_t[2] = 0; + w2_t[3] = 0; - u32 e; + w3_t[0] = 0; + w3_t[1] = 0; + w3_t[2] = 0; + w3_t[3] = 32 * 8; + + u32x e; a = SHA1M_A; b = SHA1M_B; @@ -431,111 +492,107 @@ static void m04700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le #undef K #define K SHA1C00 - SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t); - SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t); - SHA1_STEP (SHA1_F0o, d, e, a, b, c, w2_t); - SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t); - SHA1_STEP (SHA1_F0o, b, c, d, e, a, w4_t); - SHA1_STEP (SHA1_F0o, a, b, c, d, e, w5_t); - SHA1_STEP (SHA1_F0o, e, a, b, c, d, w6_t); - SHA1_STEP (SHA1_F0o, d, e, a, b, c, w7_t); - SHA1_STEP (SHA1_F0o, c, d, e, a, b, w8_t); - SHA1_STEP (SHA1_F0o, b, c, d, e, a, w9_t); - SHA1_STEP (SHA1_F0o, a, b, c, d, e, wa_t); - SHA1_STEP (SHA1_F0o, e, a, b, c, d, wb_t); - SHA1_STEP (SHA1_F0o, d, e, a, b, c, wc_t); - SHA1_STEP (SHA1_F0o, c, d, e, a, b, wd_t); - SHA1_STEP (SHA1_F0o, b, c, d, e, a, we_t); - SHA1_STEP (SHA1_F0o, a, b, c, d, e, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w0_t[0]); + SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[1]); + SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[2]); + SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[3]); + SHA1_STEP (SHA1_F0o, b, c, d, e, a, w1_t[0]); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w1_t[1]); + SHA1_STEP (SHA1_F0o, e, a, b, c, d, w1_t[2]); + SHA1_STEP (SHA1_F0o, d, e, a, b, c, w1_t[3]); + SHA1_STEP (SHA1_F0o, c, d, e, a, b, w2_t[0]); + SHA1_STEP (SHA1_F0o, b, c, d, e, a, w2_t[1]); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w2_t[2]); + SHA1_STEP (SHA1_F0o, e, a, b, c, d, w2_t[3]); + SHA1_STEP (SHA1_F0o, d, e, a, b, c, w3_t[0]); + SHA1_STEP (SHA1_F0o, c, d, e, a, b, w3_t[1]); + SHA1_STEP (SHA1_F0o, b, c, d, e, a, w3_t[2]); + SHA1_STEP (SHA1_F0o, a, b, c, d, e, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F0o, e, a, b, c, d, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F0o, d, e, a, b, c, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F0o, c, d, e, a, b, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F0o, b, c, d, e, a, w0_t[3]); #undef K #define K SHA1C01 - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w7_t); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[3]); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[3]); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[3]); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[3]); #undef K #define K SHA1C02 - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wb_t); - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, wb_t); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[3]); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w0_t[3]); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F2o, a, b, c, d, e, w1_t[3]); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F2o, e, a, b, c, d, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F2o, d, e, a, b, c, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F2o, c, d, e, a, b, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F2o, b, c, d, e, a, w2_t[3]); #undef K #define K SHA1C03 - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, wf_t); - w0_t = rotl32 ((wd_t ^ w8_t ^ w2_t ^ w0_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t); - w1_t = rotl32 ((we_t ^ w9_t ^ w3_t ^ w1_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t); - w2_t = rotl32 ((wf_t ^ wa_t ^ w4_t ^ w2_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w2_t); - w3_t = rotl32 ((w0_t ^ wb_t ^ w5_t ^ w3_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t); - w4_t = rotl32 ((w1_t ^ wc_t ^ w6_t ^ w4_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w4_t); - w5_t = rotl32 ((w2_t ^ wd_t ^ w7_t ^ w5_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w5_t); - w6_t = rotl32 ((w3_t ^ we_t ^ w8_t ^ w6_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w6_t); - w7_t = rotl32 ((w4_t ^ wf_t ^ w9_t ^ w7_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w7_t); - w8_t = rotl32 ((w5_t ^ w0_t ^ wa_t ^ w8_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w8_t); - w9_t = rotl32 ((w6_t ^ w1_t ^ wb_t ^ w9_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w9_t); - wa_t = rotl32 ((w7_t ^ w2_t ^ wc_t ^ wa_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa_t); - wb_t = rotl32 ((w8_t ^ w3_t ^ wd_t ^ wb_t), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb_t); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[3]); + w0_t[0] = rotl32 ((w3_t[1] ^ w2_t[0] ^ w0_t[2] ^ w0_t[0]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w0_t[0]); + w0_t[1] = rotl32 ((w3_t[2] ^ w2_t[1] ^ w0_t[3] ^ w0_t[1]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w0_t[1]); + w0_t[2] = rotl32 ((w3_t[3] ^ w2_t[2] ^ w1_t[0] ^ w0_t[2]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w0_t[2]); + w0_t[3] = rotl32 ((w0_t[0] ^ w2_t[3] ^ w1_t[1] ^ w0_t[3]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w0_t[3]); + w1_t[0] = rotl32 ((w0_t[1] ^ w3_t[0] ^ w1_t[2] ^ w1_t[0]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w1_t[0]); + w1_t[1] = rotl32 ((w0_t[2] ^ w3_t[1] ^ w1_t[3] ^ w1_t[1]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w1_t[1]); + w1_t[2] = rotl32 ((w0_t[3] ^ w3_t[2] ^ w2_t[0] ^ w1_t[2]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w1_t[2]); + w1_t[3] = rotl32 ((w1_t[0] ^ w3_t[3] ^ w2_t[1] ^ w1_t[3]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w1_t[3]); + w2_t[0] = rotl32 ((w1_t[1] ^ w0_t[0] ^ w2_t[2] ^ w2_t[0]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w2_t[0]); + w2_t[1] = rotl32 ((w1_t[2] ^ w0_t[1] ^ w2_t[3] ^ w2_t[1]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w2_t[1]); + w2_t[2] = rotl32 ((w1_t[3] ^ w0_t[2] ^ w3_t[0] ^ w2_t[2]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w2_t[2]); + w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); - if (allx (e != e_rev)) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; - wc_t = rotl32 ((w9_t ^ w4_t ^ we_t ^ wc_t), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc_t); - wd_t = rotl32 ((wa_t ^ w5_t ^ wf_t ^ wd_t), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd_t); - we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); - wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); + w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); + w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); + w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); + w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } + __kernel void m04700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** diff --git a/OpenCL/m04800_a0.cl b/OpenCL/m04800_a0.cl index 695ef8843..145d1ad88 100644 --- a/OpenCL/m04800_a0.cl +++ b/OpenCL/m04800_a0.cl @@ -138,7 +138,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); w0[0] |= s0[0]; w0[1] |= s0[1]; @@ -164,7 +164,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * add id byte */ - switch_buffer_by_offset (w0, w1, w2, w3, 1); + switch_buffer_by_offset_le (w0, w1, w2, w3, 1); w0[0] |= salt_buf[4]; @@ -392,7 +392,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); w0[0] |= s0[0]; w0[1] |= s0[1]; @@ -418,7 +418,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * add id byte */ - switch_buffer_by_offset (w0, w1, w2, w3, 1); + switch_buffer_by_offset_le (w0, w1, w2, w3, 1); w0[0] |= salt_buf[4]; diff --git a/OpenCL/m04800_a1.cl b/OpenCL/m04800_a1.cl index fac916902..cc09dfb36 100644 --- a/OpenCL/m04800_a1.cl +++ b/OpenCL/m04800_a1.cl @@ -68,7 +68,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -125,7 +125,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -160,7 +160,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * add id byte */ - switch_buffer_by_offset (w0, w1, w2, w3, 1); + switch_buffer_by_offset_le (w0, w1, w2, w3, 1); w0[0] |= salt_buf[4]; @@ -196,7 +196,7 @@ __kernel void m04800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -359,7 +359,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -428,7 +428,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -463,7 +463,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * add id byte */ - switch_buffer_by_offset (w0, w1, w2, w3, 1); + switch_buffer_by_offset_le (w0, w1, w2, w3, 1); w0[0] |= salt_buf[4]; @@ -499,7 +499,7 @@ __kernel void m04800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m04800_a3.cl b/OpenCL/m04800_a3.cl index 96c35360f..6bf5ce025 100644 --- a/OpenCL/m04800_a3.cl +++ b/OpenCL/m04800_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m04800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -75,37 +75,37 @@ static void m04800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // move w by 1 - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len); + switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -137,12 +137,12 @@ static void m04800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * loop */ - u32 w0l = w0_t[0]; - u32 w1l = w0_t[1]; + u32x w0l = w0_t[0]; + u32x w1l = w0_t[1]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); w0_t[0] = w0l | (w0r << 8); w0_t[1] = w1l | (w0r >> 24); @@ -151,10 +151,10 @@ static void m04800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -224,12 +224,7 @@ static void m04800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -288,37 +283,37 @@ static void m04800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // move w by 1 - u32 w0_t[4]; + u32x w0_t[4]; w0_t[0] = w0[0]; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = 0; w2_t[1] = 0; w2_t[2] = 0; w2_t[3] = 0; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = 0; w3_t[1] = 0; w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len); + switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -362,12 +357,12 @@ static void m04800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * loop */ - u32 w0l = w0_t[0]; - u32 w1l = w0_t[1]; + u32x w0l = w0_t[0]; + u32x w1l = w0_t[1]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); w0_t[0] = w0l | (w0r << 8); w0_t[1] = w1l | (w0r >> 24); @@ -376,10 +371,10 @@ static void m04800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * md5 */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -446,20 +441,13 @@ static void m04800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , b, c, d, a, w3_t[1], MD5C3b, MD5S33); MD5_STEP (MD5_I , a, b, c, d, w1_t[0], MD5C3c, MD5S30); - bool q_cond = allx (search[0] != a); - - if (q_cond) continue; + if (MATCHES_NONE_VS (a, search[0])) continue; MD5_STEP (MD5_I , d, a, b, c, w2_t[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m04900_a0.cl b/OpenCL/m04900_a0.cl index cc7537643..e53112e90 100644 --- a/OpenCL/m04900_a0.cl +++ b/OpenCL/m04900_a0.cl @@ -128,7 +128,7 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -179,7 +179,7 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -466,7 +466,7 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -517,7 +517,7 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + out_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; diff --git a/OpenCL/m04900_a1.cl b/OpenCL/m04900_a1.cl index 587a23b56..80b33877e 100644 --- a/OpenCL/m04900_a1.cl +++ b/OpenCL/m04900_a1.cl @@ -68,7 +68,7 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -145,7 +145,7 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0_t[4]; @@ -180,7 +180,7 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -231,7 +231,7 @@ __kernel void m04900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; @@ -442,7 +442,7 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -537,7 +537,7 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0_t[4]; @@ -572,7 +572,7 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * prepend salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -623,7 +623,7 @@ __kernel void m04900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, salt_len + pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, salt_len + pw_len); w0_t[0] |= s0[0]; w0_t[1] |= s0[1]; diff --git a/OpenCL/m04900_a3.cl b/OpenCL/m04900_a3.cl index 8c3ffb64d..69e1fdafc 100644 --- a/OpenCL/m04900_a3.cl +++ b/OpenCL/m04900_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -67,7 +67,7 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // first we need to switch the right-hand salt to the correct position (2nd salt) - switch_buffer_by_offset (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len); + switch_buffer_by_offset_le_S (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len); u32 salt_buf0[4]; @@ -119,7 +119,7 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le salt_buf3[2] |= salt_buf3_t[2]; salt_buf3[3] |= salt_buf3_t[3]; - append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); + append_0x80_4x4_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); /** * loop @@ -127,34 +127,34 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -165,7 +165,7 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * put the password after the first salt but before the second salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -183,32 +183,32 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[1] |= salt_buf3[1]; w3_t[2] |= salt_buf3[2]; - u32 w0 = swap32 (w0_t[0]); - u32 w1 = swap32 (w0_t[1]); - u32 w2 = swap32 (w0_t[2]); - u32 w3 = swap32 (w0_t[3]); - u32 w4 = swap32 (w1_t[0]); - u32 w5 = swap32 (w1_t[1]); - u32 w6 = swap32 (w1_t[2]); - u32 w7 = swap32 (w1_t[3]); - u32 w8 = swap32 (w2_t[0]); - u32 w9 = swap32 (w2_t[1]); - u32 wa = swap32 (w2_t[2]); - u32 wb = swap32 (w2_t[3]); - u32 wc = swap32 (w3_t[0]); - u32 wd = swap32 (w3_t[1]); - u32 we = swap32 (w3_t[2]); - u32 wf = pw_salt_len * 8; + u32x w0 = swap32 (w0_t[0]); + u32x w1 = swap32 (w0_t[1]); + u32x w2 = swap32 (w0_t[2]); + u32x w3 = swap32 (w0_t[3]); + u32x w4 = swap32 (w1_t[0]); + u32x w5 = swap32 (w1_t[1]); + u32x w6 = swap32 (w1_t[2]); + u32x w7 = swap32 (w1_t[3]); + u32x w8 = swap32 (w2_t[0]); + u32x w9 = swap32 (w2_t[1]); + u32x wa = swap32 (w2_t[2]); + u32x wb = swap32 (w2_t[3]); + u32x wc = swap32 (w3_t[0]); + u32x wd = swap32 (w3_t[1]); + u32x we = swap32 (w3_t[2]); + u32x wf = pw_salt_len * 8; /** * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -306,12 +306,7 @@ static void m04900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we); wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -340,7 +335,7 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * reverse */ - const u32 e_rev = rotl32 (search[1], 2u); + const u32 e_rev = rotl32_S (search[1], 2u); /** * salt @@ -380,7 +375,7 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // first we need to switch the right-hand salt to the correct position (2nd salt) - switch_buffer_by_offset (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len); + switch_buffer_by_offset_le_S (salt_buf0_t, salt_buf1_t, salt_buf2_t, salt_buf3_t, salt_len + pw_len); u32 salt_buf0[4]; @@ -432,7 +427,7 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le salt_buf3[2] |= salt_buf3_t[2]; salt_buf3[3] |= salt_buf3_t[3]; - append_0x80_4x4 (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); + append_0x80_4x4_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_salt_len); /** * loop @@ -440,34 +435,34 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; @@ -478,7 +473,7 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * put the password after the first salt but before the second salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] |= salt_buf0[0]; w0_t[1] |= salt_buf0[1]; @@ -496,32 +491,32 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[1] |= salt_buf3[1]; w3_t[2] |= salt_buf3[2]; - u32 w0 = swap32 (w0_t[0]); - u32 w1 = swap32 (w0_t[1]); - u32 w2 = swap32 (w0_t[2]); - u32 w3 = swap32 (w0_t[3]); - u32 w4 = swap32 (w1_t[0]); - u32 w5 = swap32 (w1_t[1]); - u32 w6 = swap32 (w1_t[2]); - u32 w7 = swap32 (w1_t[3]); - u32 w8 = swap32 (w2_t[0]); - u32 w9 = swap32 (w2_t[1]); - u32 wa = swap32 (w2_t[2]); - u32 wb = swap32 (w2_t[3]); - u32 wc = swap32 (w3_t[0]); - u32 wd = swap32 (w3_t[1]); - u32 we = swap32 (w3_t[2]); - u32 wf = pw_salt_len * 8; + u32x w0 = swap32 (w0_t[0]); + u32x w1 = swap32 (w0_t[1]); + u32x w2 = swap32 (w0_t[2]); + u32x w3 = swap32 (w0_t[3]); + u32x w4 = swap32 (w1_t[0]); + u32x w5 = swap32 (w1_t[1]); + u32x w6 = swap32 (w1_t[2]); + u32x w7 = swap32 (w1_t[3]); + u32x w8 = swap32 (w2_t[0]); + u32x w9 = swap32 (w2_t[1]); + u32x wa = swap32 (w2_t[2]); + u32x wb = swap32 (w2_t[3]); + u32x wc = swap32 (w3_t[0]); + u32x wd = swap32 (w3_t[1]); + u32x we = swap32 (w3_t[2]); + u32x wf = pw_salt_len * 8; /** * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -615,19 +610,14 @@ static void m04900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wa = rotl32 ((w7 ^ w2 ^ wc ^ wa), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wa); wb = rotl32 ((w8 ^ w3 ^ wd ^ wb), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, wb); - if (allx (e != e_rev)) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; wc = rotl32 ((w9 ^ w4 ^ we ^ wc), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, wc); wd = rotl32 ((wa ^ w5 ^ wf ^ wd), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, wd); we = rotl32 ((wb ^ w6 ^ w0 ^ we), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we); wf = rotl32 ((wc ^ w7 ^ w1 ^ wf), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } diff --git a/OpenCL/m05000_a1.cl b/OpenCL/m05000_a1.cl index 71e0a2af6..d64898914 100644 --- a/OpenCL/m05000_a1.cl +++ b/OpenCL/m05000_a1.cl @@ -136,7 +136,7 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x01_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -191,7 +191,7 @@ __kernel void m05000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x01_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -397,7 +397,7 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x01_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -464,7 +464,7 @@ __kernel void m05000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x01_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m05000_a3.cl b/OpenCL/m05000_a3.cl index c4eeb1617..eff5dc319 100644 --- a/OpenCL/m05000_a3.cl +++ b/OpenCL/m05000_a3.cl @@ -5,6 +5,8 @@ #define _KECCAK_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 keccakf_rndc[24] = { @@ -49,8 +49,8 @@ __constant u64 keccakf_rndc[24] = #define Rho_Pi(s) \ { \ - u32 j = keccakf_piln[s]; \ - u32 k = keccakf_rotc[s]; \ + u32 j = keccakf_piln[s]; \ + u32 k = keccakf_rotc[s]; \ bc0 = st[j]; \ st[j] = rotl64 (t, k); \ t = bc0; \ @@ -83,13 +83,13 @@ static void m05000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * const */ - const u8 keccakf_rotc[24] = + const u32 keccakf_rotc[24] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 }; - const u8 keccakf_piln[24] = + const u32 keccakf_piln[24] = { 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 @@ -111,22 +111,22 @@ static void m05000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u64 st[25]; + u64x st[25]; - st[ 0] = (u64) (w0[0]) | (u64) (w0[1]) << 32; - st[ 1] = (u64) (w0[2]) | (u64) (w0[3]) << 32; - st[ 2] = (u64) (w1[0]) | (u64) (w1[1]) << 32; - st[ 3] = (u64) (w1[2]) | (u64) (w1[3]) << 32; - st[ 4] = (u64) (w2[0]) | (u64) (w2[1]) << 32; - st[ 5] = (u64) (w2[2]) | (u64) (w2[3]) << 32; - st[ 6] = (u64) (w3[0]) | (u64) (w3[1]) << 32; - st[ 7] = (u64) (w3[2]) | (u64) (w3[3]) << 32; + st[ 0] = hl32_to_64 (w0[1], w0lr); + st[ 1] = hl32_to_64 (w0[3], w0[2]); + st[ 2] = hl32_to_64 (w1[1], w1[0]); + st[ 3] = hl32_to_64 (w1[3], w1[2]); + st[ 4] = hl32_to_64 (w2[1], w2[0]); + st[ 5] = hl32_to_64 (w2[3], w2[2]); + st[ 6] = hl32_to_64 (w3[1], w3[0]); + st[ 7] = hl32_to_64 (w3[3], w3[2]); st[ 8] = 0; st[ 9] = 0; st[10] = 0; @@ -153,13 +153,13 @@ static void m05000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le { // Theta - u64 bc0 = Theta1 (0); - u64 bc1 = Theta1 (1); - u64 bc2 = Theta1 (2); - u64 bc3 = Theta1 (3); - u64 bc4 = Theta1 (4); + u64x bc0 = Theta1 (0); + u64x bc1 = Theta1 (1); + u64x bc2 = Theta1 (2); + u64x bc3 = Theta1 (3); + u64x bc4 = Theta1 (4); - u64 t; + u64x t; t = bc4 ^ rotl64 (bc1, 1); Theta2 (0); t = bc0 ^ rotl64 (bc2, 1); Theta2 (1); @@ -209,12 +209,12 @@ static void m05000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le st[0] ^= keccakf_rndc[round]; } - const u32 r0 = l32_from_64 (st[1]); - const u32 r1 = h32_from_64 (st[1]); - const u32 r2 = l32_from_64 (st[2]); - const u32 r3 = h32_from_64 (st[2]); + const u32x r0 = l32_from_64 (st[1]); + const u32x r1 = h32_from_64 (st[1]); + const u32x r2 = l32_from_64 (st[2]); + const u32x r3 = h32_from_64 (st[2]); - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -231,13 +231,13 @@ static void m05000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * const */ - const u8 keccakf_rotc[24] = + const u32 keccakf_rotc[24] = { 1, 3, 6, 10, 15, 21, 28, 36, 45, 55, 2, 14, 27, 41, 56, 8, 25, 43, 62, 18, 39, 61, 20, 44 }; - const u8 keccakf_piln[24] = + const u32 keccakf_piln[24] = { 10, 7, 11, 17, 18, 3, 5, 16, 8, 21, 24, 4, 15, 23, 19, 13, 12, 2, 20, 14, 22, 9, 6, 1 @@ -271,22 +271,22 @@ static void m05000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u64 st[25]; + u64x st[25]; - st[ 0] = (u64) (w0[0]) | (u64) (w0[1]) << 32; - st[ 1] = (u64) (w0[2]) | (u64) (w0[3]) << 32; - st[ 2] = (u64) (w1[0]) | (u64) (w1[1]) << 32; - st[ 3] = (u64) (w1[2]) | (u64) (w1[3]) << 32; - st[ 4] = (u64) (w2[0]) | (u64) (w2[1]) << 32; - st[ 5] = (u64) (w2[2]) | (u64) (w2[3]) << 32; - st[ 6] = (u64) (w3[0]) | (u64) (w3[1]) << 32; - st[ 7] = (u64) (w3[2]) | (u64) (w3[3]) << 32; + st[ 0] = hl32_to_64 (w0[1], w0lr); + st[ 1] = hl32_to_64 (w0[3], w0[2]); + st[ 2] = hl32_to_64 (w1[1], w1[0]); + st[ 3] = hl32_to_64 (w1[3], w1[2]); + st[ 4] = hl32_to_64 (w2[1], w2[0]); + st[ 5] = hl32_to_64 (w2[3], w2[2]); + st[ 6] = hl32_to_64 (w3[1], w3[0]); + st[ 7] = hl32_to_64 (w3[3], w3[2]); st[ 8] = 0; st[ 9] = 0; st[10] = 0; @@ -313,13 +313,13 @@ static void m05000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le { // Theta - u64 bc0 = Theta1 (0); - u64 bc1 = Theta1 (1); - u64 bc2 = Theta1 (2); - u64 bc3 = Theta1 (3); - u64 bc4 = Theta1 (4); + u64x bc0 = Theta1 (0); + u64x bc1 = Theta1 (1); + u64x bc2 = Theta1 (2); + u64x bc3 = Theta1 (3); + u64x bc4 = Theta1 (4); - u64 t; + u64x t; t = bc4 ^ rotl64 (bc1, 1); Theta2 (0); t = bc0 ^ rotl64 (bc2, 1); Theta2 (1); @@ -369,12 +369,12 @@ static void m05000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le st[0] ^= keccakf_rndc[round]; } - const u32 r0 = l32_from_64 (st[1]); - const u32 r1 = h32_from_64 (st[1]); - const u32 r2 = l32_from_64 (st[2]); - const u32 r3 = h32_from_64 (st[2]); + const u32x r0 = l32_from_64 (st[1]); + const u32x r1 = h32_from_64 (st[1]); + const u32x r2 = l32_from_64 (st[2]); + const u32x r3 = h32_from_64 (st[2]); - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m05100_a1.cl b/OpenCL/m05100_a1.cl index 966052060..a8de2f916 100644 --- a/OpenCL/m05100_a1.cl +++ b/OpenCL/m05100_a1.cl @@ -70,7 +70,7 @@ __kernel void m05100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -113,7 +113,7 @@ __kernel void m05100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -321,7 +321,7 @@ __kernel void m05100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -364,7 +364,7 @@ __kernel void m05100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m05100_a3.cl b/OpenCL/m05100_a3.cl index fa2de71fc..4de1ac840 100644 --- a/OpenCL/m05100_a3.cl +++ b/OpenCL/m05100_a3.cl @@ -5,6 +5,8 @@ #define _MD5H_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -41,18 +41,18 @@ static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, a, b, c, d, w0lr, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); @@ -72,7 +72,7 @@ static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, b, c, d, a, w0lr, MD5C13, MD5S13); MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); @@ -95,7 +95,7 @@ static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , d, a, b, c, w0lr, MD5C29, MD5S21); MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); @@ -103,7 +103,7 @@ static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , a, b, c, d, w0lr, MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); @@ -125,32 +125,14 @@ static void m05100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += MD5M_C; d += MD5M_D; - { - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; + u32x e = 0; + u32x f = 0; - #include COMPARE_M - } + COMPARE_M_SIMD (a, b, e, f); - { - const u32 r0 = b; - const u32 r1 = c; - const u32 r2 = 0; - const u32 r3 = 0; + COMPARE_M_SIMD (b, c, e, f); - #include COMPARE_M - } - - { - const u32 r0 = c; - const u32 r1 = d; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M - } + COMPARE_M_SIMD (c, d, e, f); } } @@ -187,18 +169,18 @@ static void m05100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; - MD5_STEP (MD5_Fo, a, b, c, d, w0[0], MD5C00, MD5S00); + MD5_STEP (MD5_Fo, a, b, c, d, w0lr, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0[1], MD5C01, MD5S01); MD5_STEP (MD5_Fo, c, d, a, b, w0[2], MD5C02, MD5S02); MD5_STEP (MD5_Fo, b, c, d, a, w0[3], MD5C03, MD5S03); @@ -218,7 +200,7 @@ static void m05100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_Go, a, b, c, d, w0[1], MD5C10, MD5S10); MD5_STEP (MD5_Go, d, a, b, c, w1[2], MD5C11, MD5S11); MD5_STEP (MD5_Go, c, d, a, b, w2[3], MD5C12, MD5S12); - MD5_STEP (MD5_Go, b, c, d, a, w0[0], MD5C13, MD5S13); + MD5_STEP (MD5_Go, b, c, d, a, w0lr, MD5C13, MD5S13); MD5_STEP (MD5_Go, a, b, c, d, w1[1], MD5C14, MD5S10); MD5_STEP (MD5_Go, d, a, b, c, w2[2], MD5C15, MD5S11); MD5_STEP (MD5_Go, c, d, a, b, w3[3], MD5C16, MD5S12); @@ -241,7 +223,7 @@ static void m05100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_H , c, d, a, b, w1[3], MD5C26, MD5S22); MD5_STEP (MD5_H , b, c, d, a, w2[2], MD5C27, MD5S23); MD5_STEP (MD5_H , a, b, c, d, w3[1], MD5C28, MD5S20); - MD5_STEP (MD5_H , d, a, b, c, w0[0], MD5C29, MD5S21); + MD5_STEP (MD5_H , d, a, b, c, w0lr, MD5C29, MD5S21); MD5_STEP (MD5_H , c, d, a, b, w0[3], MD5C2a, MD5S22); MD5_STEP (MD5_H , b, c, d, a, w1[2], MD5C2b, MD5S23); MD5_STEP (MD5_H , a, b, c, d, w2[1], MD5C2c, MD5S20); @@ -249,7 +231,7 @@ static void m05100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_H , c, d, a, b, w3[3], MD5C2e, MD5S22); MD5_STEP (MD5_H , b, c, d, a, w0[2], MD5C2f, MD5S23); - MD5_STEP (MD5_I , a, b, c, d, w0[0], MD5C30, MD5S30); + MD5_STEP (MD5_I , a, b, c, d, w0lr, MD5C30, MD5S30); MD5_STEP (MD5_I , d, a, b, c, w1[3], MD5C31, MD5S31); MD5_STEP (MD5_I , c, d, a, b, w3[2], MD5C32, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w1[1], MD5C33, MD5S33); @@ -271,32 +253,14 @@ static void m05100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += MD5M_C; d += MD5M_D; - { - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; + u32x e = 0; + u32x f = 0; - #include COMPARE_S - } + COMPARE_S_SIMD (a, b, e, f); - { - const u32 r0 = b; - const u32 r1 = c; - const u32 r2 = 0; - const u32 r3 = 0; + COMPARE_S_SIMD (b, c, e, f); - #include COMPARE_S - } - - { - const u32 r0 = c; - const u32 r1 = d; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S - } + COMPARE_S_SIMD (c, d, e, f); } } diff --git a/OpenCL/m05200.cl b/OpenCL/m05200.cl index ef1ba257d..6b992b4a1 100644 --- a/OpenCL/m05200.cl +++ b/OpenCL/m05200.cl @@ -200,7 +200,7 @@ __kernel void m05200_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 salt_len = salt_bufs[salt_pos].salt_len; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; diff --git a/OpenCL/m05300_a1.cl b/OpenCL/m05300_a1.cl index e8ddc4b44..5b72c033e 100644 --- a/OpenCL/m05300_a1.cl +++ b/OpenCL/m05300_a1.cl @@ -280,7 +280,7 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -323,7 +323,7 @@ __kernel void m05300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -569,7 +569,7 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -624,7 +624,7 @@ __kernel void m05300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m05300_a3.cl b/OpenCL/m05300_a3.cl index 488d21880..60df34257 100644 --- a/OpenCL/m05300_a3.cl +++ b/OpenCL/m05300_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,33 +18,31 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -118,7 +118,7 @@ static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co digest[3] += d; } -static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) +static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -169,7 +169,7 @@ static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4 md5_transform (w0, w1, w2, w3, opad); } -static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) +static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -225,46 +225,46 @@ static void m05300m (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = w3[2]; w3_t[3] = w3[3]; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -285,7 +285,7 @@ static void m05300m (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w w3_t[2] = (64 + nr_len) * 8; w3_t[3] = 0; - u32 digest[4]; + u32x digest[4]; hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); @@ -352,12 +352,7 @@ static void m05300m (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]); } } @@ -395,46 +390,46 @@ static void m05300s (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = w3[2]; w3_t[3] = w3[3]; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -455,7 +450,7 @@ static void m05300s (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w w3_t[2] = (64 + nr_len) * 8; w3_t[3] = 0; - u32 digest[4]; + u32x digest[4]; hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); @@ -522,12 +517,7 @@ static void m05300s (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]); } } diff --git a/OpenCL/m05400_a1.cl b/OpenCL/m05400_a1.cl index 66e73bd94..e4a9f1451 100644 --- a/OpenCL/m05400_a1.cl +++ b/OpenCL/m05400_a1.cl @@ -314,7 +314,7 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -357,7 +357,7 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -603,7 +603,7 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -658,7 +658,7 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m05400_a3.cl b/OpenCL/m05400_a3.cl index af976ffe4..2389ba816 100644 --- a/OpenCL/m05400_a3.cl +++ b/OpenCL/m05400_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,34 +18,32 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -148,7 +148,7 @@ static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], c digest[4] += E; } -static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) +static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -201,7 +201,7 @@ static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[ sha1_transform (w0, w1, w2, w3, opad); } -static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -259,46 +259,46 @@ static void m05400m (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -319,7 +319,7 @@ static void m05400m (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w w3_t[2] = 0; w3_t[3] = (64 + nr_len) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); @@ -386,12 +386,7 @@ static void m05400m (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); } } @@ -429,46 +424,46 @@ static void m05400s (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -489,7 +484,7 @@ static void m05400s (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w w3_t[2] = 0; w3_t[3] = (64 + nr_len) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); @@ -556,12 +551,7 @@ static void m05400s (__local u32 w_s[16], u32 w0[4], u32 w1[4], u32 w2[4], u32 w hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); } } @@ -583,7 +573,7 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 16; i += lsz) { - w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]); + w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -592,7 +582,7 @@ __kernel void m05400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 128; i += lsz) { - s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]); + s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -658,7 +648,7 @@ __kernel void m05400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 16; i += lsz) { - w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]); + w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -667,7 +657,7 @@ __kernel void m05400_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 128; i += lsz) { - s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]); + s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -733,7 +723,7 @@ __kernel void m05400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 16; i += lsz) { - w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]); + w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -742,7 +732,7 @@ __kernel void m05400_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 128; i += lsz) { - s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]); + s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -808,7 +798,7 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 16; i += lsz) { - w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]); + w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -817,7 +807,7 @@ __kernel void m05400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 128; i += lsz) { - s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]); + s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -883,7 +873,7 @@ __kernel void m05400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 16; i += lsz) { - w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]); + w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -892,7 +882,7 @@ __kernel void m05400_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 128; i += lsz) { - s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]); + s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -958,7 +948,7 @@ __kernel void m05400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 16; i += lsz) { - w_s[i] = swap32 (ikepsk_bufs[salt_pos].nr_buf[i]); + w_s[i] = swap32_S (ikepsk_bufs[salt_pos].nr_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); @@ -967,7 +957,7 @@ __kernel void m05400_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, for (u32 i = lid; i < 128; i += lsz) { - s_msg_buf[i] = swap32 (ikepsk_bufs[salt_pos].msg_buf[i]); + s_msg_buf[i] = swap32_S (ikepsk_bufs[salt_pos].msg_buf[i]); } barrier (CLK_LOCAL_MEM_FENCE); diff --git a/OpenCL/m05500_a1.cl b/OpenCL/m05500_a1.cl index daff9ace4..c1fe24e58 100644 --- a/OpenCL/m05500_a1.cl +++ b/OpenCL/m05500_a1.cl @@ -546,7 +546,7 @@ __kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -602,7 +602,7 @@ __kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -840,7 +840,7 @@ __kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -908,7 +908,7 @@ __kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m05500_a3.cl b/OpenCL/m05500_a3.cl index 61053994c..f2016f507 100644 --- a/OpenCL/m05500_a3.cl +++ b/OpenCL/m05500_a3.cl @@ -5,6 +5,8 @@ #define _MD4_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -336,18 +336,26 @@ __constant u32 c_skb[8][64] = } }; +#if VECT_SIZE == 1 #define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#endif -static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 s_SPtrans[8][64]) +static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 s_SPtrans[8][64]) { - u32 r = data[0]; - u32 l = data[1]; + u32x r = data[0]; + u32x l = data[1]; #pragma unroll 16 for (u32 i = 0; i < 16; i += 2) { - u32 u; - u32 t; + u32x u; + u32x t; u = Kc[i + 0] ^ rotl32 (r, 30u); t = Kd[i + 0] ^ rotl32 (r, 26u); @@ -378,9 +386,9 @@ static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], iv[1] = r; } -static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 s_skb[8][64]) +static void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 s_skb[8][64]) { - u32 tt; + u32x tt; PERM_OP (d, c, tt, 4, 0x0f0f0f0f); HPERM_OP (c, tt, 2, 0xcccc0000); @@ -413,60 +421,74 @@ static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u c = c & 0x0fffffff; d = d & 0x0fffffff; - const u32 c00 = (c >> 0) & 0x0000003f; - const u32 c06 = (c >> 6) & 0x00383003; - const u32 c07 = (c >> 7) & 0x0000003c; - const u32 c13 = (c >> 13) & 0x0000060f; - const u32 c20 = (c >> 20) & 0x00000001; + const u32x c00 = (c >> 0) & 0x0000003f; + const u32x c06 = (c >> 6) & 0x00383003; + const u32x c07 = (c >> 7) & 0x0000003c; + const u32x c13 = (c >> 13) & 0x0000060f; + const u32x c20 = (c >> 20) & 0x00000001; - u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb) - | BOX (((c06 >> 0) & 0xff) - |((c07 >> 0) & 0xff), 1, s_skb) - | BOX (((c13 >> 0) & 0xff) - |((c06 >> 8) & 0xff), 2, s_skb) - | BOX (((c20 >> 0) & 0xff) - |((c13 >> 8) & 0xff) - |((c06 >> 16) & 0xff), 3, s_skb); + u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); - const u32 d00 = (d >> 0) & 0x00003c3f; - const u32 d07 = (d >> 7) & 0x00003f03; - const u32 d21 = (d >> 21) & 0x0000000f; - const u32 d22 = (d >> 22) & 0x00000030; + const u32x d00 = (d >> 0) & 0x00003c3f; + const u32x d07 = (d >> 7) & 0x00003f03; + const u32x d21 = (d >> 21) & 0x0000000f; + const u32x d22 = (d >> 22) & 0x00000030; - u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb) - | BOX (((d07 >> 0) & 0xff) - |((d00 >> 8) & 0xff), 5, s_skb) - | BOX (((d07 >> 8) & 0xff), 6, s_skb) - | BOX (((d21 >> 0) & 0xff) - |((d22 >> 0) & 0xff), 7, s_skb); + u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); Kc[i] = ((t << 16) | (s & 0x0000ffff)); Kd[i] = ((s >> 16) | (t & 0xffff0000)); } } -static void transform_netntlmv1_key (const u32 w0, const u32 w1, u32 out[2]) +static void transform_netntlmv1_key (const u32x w0, const u32x w1, u32x out[2]) { - const uchar4 t0 = as_uchar4 (w0); - const uchar4 t1 = as_uchar4 (w1); + u32x t[8]; - uchar4 k0; - uchar4 k1; + t[0] = (w0 >> 0) & 0xff; + t[1] = (w0 >> 8) & 0xff; + t[2] = (w0 >> 16) & 0xff; + t[3] = (w0 >> 24) & 0xff; + t[4] = (w1 >> 0) & 0xff; + t[5] = (w1 >> 8) & 0xff; + t[6] = (w1 >> 16) & 0xff; + t[7] = (w1 >> 24) & 0xff; - k0.s0 = (t0.s0 >> 0); - k0.s1 = (t0.s0 << 7) | (t0.s1 >> 1); - k0.s2 = (t0.s1 << 6) | (t0.s2 >> 2); - k0.s3 = (t0.s2 << 5) | (t0.s3 >> 3); - k1.s0 = (t0.s3 << 4) | (t1.s0 >> 4); - k1.s1 = (t1.s0 << 3) | (t1.s1 >> 5); - k1.s2 = (t1.s1 << 2) | (t1.s2 >> 6); - k1.s3 = (t1.s2 << 1); + u32x k[8]; - out[0] = as_uint (k0); - out[1] = as_uint (k1); + k[0] = (t[0] >> 0); + k[1] = (t[0] << 7) | (t[1] >> 1); + k[2] = (t[1] << 6) | (t[2] >> 2); + k[3] = (t[2] << 5) | (t[3] >> 3); + k[4] = (t[3] << 4) | (t[4] >> 4); + k[5] = (t[4] << 3) | (t[5] >> 5); + k[6] = (t[5] << 2) | (t[6] >> 6); + k[7] = (t[6] << 1); + + out[0] = ((k[0] & 0xff) << 0) + | ((k[1] & 0xff) << 8) + | ((k[2] & 0xff) << 16) + | ((k[3] & 0xff) << 24); + + out[1] = ((k[4] & 0xff) << 0) + | ((k[5] & 0xff) << 8) + | ((k[6] & 0xff) << 16) + | ((k[7] & 0xff) << 24); } -static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -483,27 +505,22 @@ static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; - u32 data[2]; - - data[0] = s0; - data[1] = s1; - /** * loop */ u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MD4M_A; - u32 b = MD4M_B; - u32 c = MD4M_C; - u32 d = MD4M_D; + u32x a = MD4M_A; + u32x b = MD4M_B; + u32x c = MD4M_C; + u32x d = MD4M_D; #define w0_t w0 #define w1_t w[ 1] @@ -571,7 +588,7 @@ static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 MD4_STEP (MD4_H , a, b, c, d, w3_t, MD4C02, MD4S20); MD4_STEP (MD4_H , d, a, b, c, wb_t, MD4C02, MD4S21); - if (allx (s2 != ((d + MD4M_D) >> 16))) continue; + if (MATCHES_NONE_VS (((d + MD4M_D) >> 16), s2)) continue; MD4_STEP (MD4_H , c, d, a, b, w7_t, MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, wf_t, MD4C02, MD4S23); @@ -585,16 +602,21 @@ static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * DES1 */ - u32 key[2]; + u32x key[2]; transform_netntlmv1_key (a, b, key); - u32 Kc[16]; - u32 Kd[16]; + u32x Kc[16]; + u32x Kd[16]; _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); - u32 iv1[2]; + u32x data[2]; + + data[0] = s0; + data[1] = s1; + + u32x iv1[2]; _des_crypt_encrypt (iv1, data, Kc, Kd, s_SPtrans); @@ -602,14 +624,14 @@ static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * DES2 */ - const u32 bc = (b >> 24) | (c << 8); - const u32 cd = (c >> 24) | (d << 8); + const u32x bc = (b >> 24) | (c << 8); + const u32x cd = (c >> 24) | (d << 8); transform_netntlmv1_key (bc, cd, key); _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); - u32 iv2[2]; + u32x iv2[2]; _des_crypt_encrypt (iv2, data, Kc, Kd, s_SPtrans); @@ -617,16 +639,11 @@ static void m05500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * compare */ - const u32 r0 = iv1[0]; - const u32 r1 = iv1[1]; - const u32 r2 = iv2[0]; - const u32 r3 = iv2[1]; - - #include COMPARE_M + COMPARE_M_SIMD (iv1[0], iv1[1], iv2[0], iv2[1]); } } -static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -643,11 +660,6 @@ static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 const u32 s1 = salt_bufs[salt_pos].salt_buf[1]; const u32 s2 = salt_bufs[salt_pos].salt_buf[2]; - u32 data[2]; - - data[0] = s0; - data[1] = s1; - /** * digest */ @@ -666,16 +678,16 @@ static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MD4M_A; - u32 b = MD4M_B; - u32 c = MD4M_C; - u32 d = MD4M_D; + u32x a = MD4M_A; + u32x b = MD4M_B; + u32x c = MD4M_C; + u32x d = MD4M_D; #define w0_t w0 #define w1_t w[ 1] @@ -743,7 +755,7 @@ static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 MD4_STEP (MD4_H , a, b, c, d, w3_t, MD4C02, MD4S20); MD4_STEP (MD4_H , d, a, b, c, wb_t, MD4C02, MD4S21); - if (allx (s2 != ((d + MD4M_D) >> 16))) continue; + if (MATCHES_NONE_VS (((d + MD4M_D) >> 16), s2)) continue; MD4_STEP (MD4_H , c, d, a, b, w7_t, MD4C02, MD4S22); MD4_STEP (MD4_H , b, c, d, a, wf_t, MD4C02, MD4S23); @@ -757,16 +769,21 @@ static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * DES1 */ - u32 key[2]; + u32x key[2]; transform_netntlmv1_key (a, b, key); - u32 Kc[16]; - u32 Kd[16]; + u32x Kc[16]; + u32x Kd[16]; _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); - u32 iv1[2]; + u32x data[2]; + + data[0] = s0; + data[1] = s1; + + u32x iv1[2]; _des_crypt_encrypt (iv1, data, Kc, Kd, s_SPtrans); @@ -779,12 +796,12 @@ static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 _des_crypt_keysetup (key[0], key[1], Kc, Kd, s_skb); - u32 iv2[2]; + u32x iv2[2]; _des_crypt_encrypt (iv2, data, Kc, Kd, s_SPtrans); */ - u32 iv2[2]; + u32x iv2[2]; iv2[0] = search[2]; iv2[1] = search[3]; @@ -793,16 +810,11 @@ static void m05500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 * compare */ - const u32 r0 = iv1[0]; - const u32 r1 = iv1[1]; - const u32 r2 = iv2[0]; - const u32 r3 = iv2[1]; - - #include COMPARE_S + COMPARE_S_SIMD (iv1[0], iv1[1], iv2[0], iv2[1]); } } -__kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -876,7 +888,7 @@ __kernel void m05500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m05500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m05500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m05500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -950,11 +962,11 @@ __kernel void m05500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m05500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m05500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m05500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -1028,7 +1040,7 @@ __kernel void m05500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m05500s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m05500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m05500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * modifier @@ -1102,6 +1114,6 @@ __kernel void m05500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m05500s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m05500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m05500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } diff --git a/OpenCL/m05600_a1.cl b/OpenCL/m05600_a1.cl index d4bf22e42..d13327022 100644 --- a/OpenCL/m05600_a1.cl +++ b/OpenCL/m05600_a1.cl @@ -360,7 +360,7 @@ __kernel void m05600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -403,7 +403,7 @@ __kernel void m05600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -682,7 +682,7 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -737,7 +737,7 @@ __kernel void m05600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m05600_a3.cl b/OpenCL/m05600_a3.cl index 6b589d6e0..e0d13c733 100644 --- a/OpenCL/m05600_a3.cl +++ b/OpenCL/m05600_a3.cl @@ -5,6 +5,8 @@ #define _NETNTLMV2_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,33 +18,31 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void md4_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md4_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; MD4_STEP (MD4_Fo, a, b, c, d, w0_t, MD4C00, MD4S00); MD4_STEP (MD4_Fo, d, a, b, c, w1_t, MD4C00, MD4S01); @@ -101,29 +101,29 @@ static void md4_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co digest[3] += d; } -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -199,7 +199,7 @@ static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co digest[3] += d; } -static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4]) +static void hmac_md5_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -250,7 +250,7 @@ static void hmac_md5_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4 md5_transform (w0, w1, w2, w3, opad); } -static void hmac_md5_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[4], u32 opad[4], u32 digest[4]) +static void hmac_md5_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[4], u32x opad[4], u32x digest[4]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -309,25 +309,45 @@ static void m05600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 digest[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x digest[4]; digest[0] = MD4M_A; digest[1] = MD4M_B; digest[2] = MD4M_C; digest[3] = MD4M_D; - md4_transform (w0, w1, w2, w3, digest); - - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + md4_transform (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = digest[0]; w0_t[1] = digest[1]; @@ -351,8 +371,8 @@ static void m05600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le digest[2] = MD5M_C; digest[3] = MD5M_D; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -465,12 +485,7 @@ static void m05600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[0], digest[3], digest[2], digest[1]); } } @@ -511,25 +526,45 @@ static void m05600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 digest[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; + + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + u32x digest[4]; digest[0] = MD4M_A; digest[1] = MD4M_B; digest[2] = MD4M_C; digest[3] = MD4M_D; - md4_transform (w0, w1, w2, w3, digest); - - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + md4_transform (w0_t, w1_t, w2_t, w3_t, digest); w0_t[0] = digest[0]; w0_t[1] = digest[1]; @@ -553,8 +588,8 @@ static void m05600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le digest[2] = MD5M_C; digest[3] = MD5M_D; - u32 ipad[4]; - u32 opad[4]; + u32x ipad[4]; + u32x opad[4]; hmac_md5_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -667,12 +702,7 @@ static void m05600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le hmac_md5_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[0]; - const u32 r1 = digest[3]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[0], digest[3], digest[2], digest[1]); } } diff --git a/OpenCL/m06000_a1.cl b/OpenCL/m06000_a1.cl index dc958dafa..ae9f54108 100644 --- a/OpenCL/m06000_a1.cl +++ b/OpenCL/m06000_a1.cl @@ -267,7 +267,7 @@ __kernel void m06000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -310,7 +310,7 @@ __kernel void m06000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -437,7 +437,7 @@ __kernel void m06000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -492,7 +492,7 @@ __kernel void m06000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m06000_a3.cl b/OpenCL/m06000_a3.cl index a289e90f5..689a7716c 100644 --- a/OpenCL/m06000_a3.cl +++ b/OpenCL/m06000_a3.cl @@ -5,6 +5,8 @@ #define _RIPEMD160_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,17 +18,15 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void ripemd160_transform (const u32 w[16], u32 dgst[5]) +static void ripemd160_transform (const u32x w[16], u32x dgst[5]) { - u32 a1 = dgst[0]; - u32 b1 = dgst[1]; - u32 c1 = dgst[2]; - u32 d1 = dgst[3]; - u32 e1 = dgst[4]; + u32x a1 = dgst[0]; + u32x b1 = dgst[1]; + u32x c1 = dgst[2]; + u32x d1 = dgst[3]; + u32x e1 = dgst[4]; RIPEMD160_STEP (RIPEMD160_F , a1, b1, c1, d1, e1, w[ 0], RIPEMD160C00, RIPEMD160S00); RIPEMD160_STEP (RIPEMD160_F , e1, a1, b1, c1, d1, w[ 1], RIPEMD160C00, RIPEMD160S01); @@ -113,11 +113,11 @@ static void ripemd160_transform (const u32 w[16], u32 dgst[5]) RIPEMD160_STEP (RIPEMD160_J , c1, d1, e1, a1, b1, w[15], RIPEMD160C40, RIPEMD160S4E); RIPEMD160_STEP (RIPEMD160_J , b1, c1, d1, e1, a1, w[13], RIPEMD160C40, RIPEMD160S4F); - u32 a2 = dgst[0]; - u32 b2 = dgst[1]; - u32 c2 = dgst[2]; - u32 d2 = dgst[3]; - u32 e2 = dgst[4]; + u32x a2 = dgst[0]; + u32x b2 = dgst[1]; + u32x c2 = dgst[2]; + u32x d2 = dgst[3]; + u32x e2 = dgst[4]; RIPEMD160_STEP_WORKAROUND_BUG (RIPEMD160_J , a2, b2, c2, d2, e2, w[ 5], RIPEMD160C50, RIPEMD160S50); RIPEMD160_STEP (RIPEMD160_J , e2, a2, b2, c2, d2, w[14], RIPEMD160C50, RIPEMD160S51); @@ -204,11 +204,11 @@ static void ripemd160_transform (const u32 w[16], u32 dgst[5]) RIPEMD160_STEP (RIPEMD160_F , c2, d2, e2, a2, b2, w[ 9], RIPEMD160C90, RIPEMD160S9E); RIPEMD160_STEP (RIPEMD160_F , b2, c2, d2, e2, a2, w[11], RIPEMD160C90, RIPEMD160S9F); - const u32 a = dgst[1] + c1 + d2; - const u32 b = dgst[2] + d1 + e2; - const u32 c = dgst[3] + e1 + a2; - const u32 d = dgst[4] + a1 + b2; - const u32 e = dgst[0] + b1 + c2; + const u32x a = dgst[1] + c1 + d2; + const u32x b = dgst[2] + d1 + e2; + const u32x c = dgst[3] + e1 + a2; + const u32x d = dgst[4] + a1 + b2; + const u32x e = dgst[0] + b1 + c2; dgst[0] = a; dgst[1] = b; @@ -238,15 +238,15 @@ static void m06000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wl[16]; + u32x wl[16]; - wl[ 0] = w0[0]; + wl[ 0] = w0lr; wl[ 1] = w0[1]; wl[ 2] = w0[2]; wl[ 3] = w0[3]; @@ -263,7 +263,7 @@ static void m06000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wl[14] = w14; wl[15] = 0; - u32 dgst[5]; + u32x dgst[5]; dgst[0] = RIPEMD160M_A; dgst[1] = RIPEMD160M_B; @@ -273,12 +273,7 @@ static void m06000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le ripemd160_transform (wl, dgst); - const u32 r0 = dgst[0]; - const u32 r1 = dgst[1]; - const u32 r2 = dgst[2]; - const u32 r3 = dgst[3]; - - #include COMPARE_M + COMPARE_M_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]); } } @@ -315,15 +310,15 @@ static void m06000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wl[16]; + u32x wl[16]; - wl[ 0] = w0[0]; + wl[ 0] = w0lr; wl[ 1] = w0[1]; wl[ 2] = w0[2]; wl[ 3] = w0[3]; @@ -340,7 +335,7 @@ static void m06000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wl[14] = w14; wl[15] = 0; - u32 dgst[5]; + u32x dgst[5]; dgst[0] = RIPEMD160M_A; dgst[1] = RIPEMD160M_B; @@ -350,12 +345,7 @@ static void m06000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le ripemd160_transform (wl, dgst); - const u32 r0 = dgst[0]; - const u32 r1 = dgst[1]; - const u32 r2 = dgst[2]; - const u32 r3 = dgst[3]; - - #include COMPARE_S + COMPARE_S_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]); } } diff --git a/OpenCL/m06100_a1.cl b/OpenCL/m06100_a1.cl index d6c3c848d..495d9c725 100644 --- a/OpenCL/m06100_a1.cl +++ b/OpenCL/m06100_a1.cl @@ -1413,7 +1413,7 @@ __kernel void m06100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -1456,7 +1456,7 @@ __kernel void m06100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -1624,7 +1624,7 @@ __kernel void m06100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -1679,7 +1679,7 @@ __kernel void m06100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m06100_a3.cl b/OpenCL/m06100_a3.cl index eb09525f0..ef1face82 100644 --- a/OpenCL/m06100_a3.cl +++ b/OpenCL/m06100_a3.cl @@ -5,6 +5,8 @@ #define _WHIRLPOOL_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,13 +18,19 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define R 10 +#if VECT_SIZE == 1 #define BOX(S,n,i) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(S,n,i) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#endif __constant u32 Ch[8][256] = { @@ -1120,10 +1128,10 @@ __constant u32 rcl[R + 1] = // this is a highly optimized that assumes dgst[16] = { 0 }; only reuse of no 2nd transform is needed -static void whirlpool_transform (const u32 w[16], u32 dgst[16], __local u32 s_Ch[8][256], __local u32 s_Cl[8][256]) +static void whirlpool_transform (const u32x w[16], u32x dgst[16], __local u32 s_Ch[8][256], __local u32 s_Cl[8][256]) { - u32 Kh[8]; - u32 Kl[8]; + u32x Kh[8]; + u32x Kl[8]; Kh[0] = 0x300beec0; Kl[0] = 0xaf902967; @@ -1142,8 +1150,8 @@ static void whirlpool_transform (const u32 w[16], u32 dgst[16], __local u32 s_Ch Kh[7] = 0x28282828; Kl[7] = 0x28282828; - u32 stateh[8]; - u32 statel[8]; + u32x stateh[8]; + u32x statel[8]; stateh[0] = w[ 0]; statel[0] = w[ 1]; @@ -1162,20 +1170,20 @@ static void whirlpool_transform (const u32 w[16], u32 dgst[16], __local u32 s_Ch stateh[7] = w[14]; statel[7] = w[15]; - u32 Lh[8]; - u32 Ll[8]; + u32x Lh[8]; + u32x Ll[8]; #pragma unroll for (int i = 0; i < 8; i++) { - const u32 Lp0 = stateh[(i + 8) & 7] >> 24; - const u32 Lp1 = stateh[(i + 7) & 7] >> 16; - const u32 Lp2 = stateh[(i + 6) & 7] >> 8; - const u32 Lp3 = stateh[(i + 5) & 7] >> 0; - const u32 Lp4 = statel[(i + 4) & 7] >> 24; - const u32 Lp5 = statel[(i + 3) & 7] >> 16; - const u32 Lp6 = statel[(i + 2) & 7] >> 8; - const u32 Lp7 = statel[(i + 1) & 7] >> 0; + const u32x Lp0 = stateh[(i + 8) & 7] >> 24; + const u32x Lp1 = stateh[(i + 7) & 7] >> 16; + const u32x Lp2 = stateh[(i + 6) & 7] >> 8; + const u32x Lp3 = stateh[(i + 5) & 7] >> 0; + const u32x Lp4 = statel[(i + 4) & 7] >> 24; + const u32x Lp5 = statel[(i + 3) & 7] >> 16; + const u32x Lp6 = statel[(i + 2) & 7] >> 8; + const u32x Lp7 = statel[(i + 1) & 7] >> 0; Lh[i] = BOX (s_Ch, 0, Lp0 & 0xff) ^ BOX (s_Ch, 1, Lp1 & 0xff) @@ -1215,20 +1223,20 @@ static void whirlpool_transform (const u32 w[16], u32 dgst[16], __local u32 s_Ch for (int r = 2; r <= R; r++) { - u32 Lh[8]; - u32 Ll[8]; + u32x Lh[8]; + u32x Ll[8]; #pragma unroll for (int i = 0; i < 8; i++) { - const u32 Lp0 = Kh[(i + 8) & 7] >> 24; - const u32 Lp1 = Kh[(i + 7) & 7] >> 16; - const u32 Lp2 = Kh[(i + 6) & 7] >> 8; - const u32 Lp3 = Kh[(i + 5) & 7] >> 0; - const u32 Lp4 = Kl[(i + 4) & 7] >> 24; - const u32 Lp5 = Kl[(i + 3) & 7] >> 16; - const u32 Lp6 = Kl[(i + 2) & 7] >> 8; - const u32 Lp7 = Kl[(i + 1) & 7] >> 0; + const u32x Lp0 = Kh[(i + 8) & 7] >> 24; + const u32x Lp1 = Kh[(i + 7) & 7] >> 16; + const u32x Lp2 = Kh[(i + 6) & 7] >> 8; + const u32x Lp3 = Kh[(i + 5) & 7] >> 0; + const u32x Lp4 = Kl[(i + 4) & 7] >> 24; + const u32x Lp5 = Kl[(i + 3) & 7] >> 16; + const u32x Lp6 = Kl[(i + 2) & 7] >> 8; + const u32x Lp7 = Kl[(i + 1) & 7] >> 0; Lh[i] = BOX (s_Ch, 0, Lp0 & 0xff) ^ BOX (s_Ch, 1, Lp1 & 0xff) @@ -1269,14 +1277,14 @@ static void whirlpool_transform (const u32 w[16], u32 dgst[16], __local u32 s_Ch #pragma unroll 8 for (int i = 0; i < 8; i++) { - const u32 Lp0 = stateh[(i + 8) & 7] >> 24; - const u32 Lp1 = stateh[(i + 7) & 7] >> 16; - const u32 Lp2 = stateh[(i + 6) & 7] >> 8; - const u32 Lp3 = stateh[(i + 5) & 7] >> 0; - const u32 Lp4 = statel[(i + 4) & 7] >> 24; - const u32 Lp5 = statel[(i + 3) & 7] >> 16; - const u32 Lp6 = statel[(i + 2) & 7] >> 8; - const u32 Lp7 = statel[(i + 1) & 7] >> 0; + const u32x Lp0 = stateh[(i + 8) & 7] >> 24; + const u32x Lp1 = stateh[(i + 7) & 7] >> 16; + const u32x Lp2 = stateh[(i + 6) & 7] >> 8; + const u32x Lp3 = stateh[(i + 5) & 7] >> 0; + const u32x Lp4 = statel[(i + 4) & 7] >> 24; + const u32x Lp5 = statel[(i + 3) & 7] >> 16; + const u32x Lp6 = statel[(i + 2) & 7] >> 8; + const u32x Lp7 = statel[(i + 1) & 7] >> 0; Lh[i] = BOX (s_Ch, 0, Lp0 & 0xff) ^ BOX (s_Ch, 1, Lp1 & 0xff) @@ -1348,15 +1356,15 @@ static void m06100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wl[16]; + u32x wl[16]; - wl[ 0] = w0[0]; + wl[ 0] = w0lr; wl[ 1] = w0[1]; wl[ 2] = w0[2]; wl[ 3] = w0[3]; @@ -1373,16 +1381,11 @@ static void m06100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wl[14] = 0; wl[15] = pw_len * 8; - u32 dgst[16]; + u32x dgst[16]; whirlpool_transform (wl, dgst, s_Ch, s_Cl); - const u32 r0 = dgst[0]; - const u32 r1 = dgst[1]; - const u32 r2 = dgst[2]; - const u32 r3 = dgst[3]; - - #include COMPARE_M + COMPARE_M_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]); } } @@ -1413,15 +1416,15 @@ static void m06100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 wl[16]; + u32x wl[16]; - wl[ 0] = w0[0]; + wl[ 0] = w0lr; wl[ 1] = w0[1]; wl[ 2] = w0[2]; wl[ 3] = w0[3]; @@ -1438,16 +1441,11 @@ static void m06100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le wl[14] = 0; wl[15] = pw_len * 8; - u32 dgst[16]; + u32x dgst[16]; whirlpool_transform (wl, dgst, s_Ch, s_Cl); - const u32 r0 = dgst[0]; - const u32 r1 = dgst[1]; - const u32 r2 = dgst[2]; - const u32 r3 = dgst[3]; - - #include COMPARE_S + COMPARE_S_SIMD (dgst[0], dgst[1], dgst[2], dgst[3]); } } diff --git a/OpenCL/m06900_a1.cl b/OpenCL/m06900_a1.cl index eab51d3d7..eaf03b309 100644 --- a/OpenCL/m06900_a1.cl +++ b/OpenCL/m06900_a1.cl @@ -757,7 +757,7 @@ __kernel void m06900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -800,7 +800,7 @@ __kernel void m06900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -1059,7 +1059,7 @@ __kernel void m06900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -1114,7 +1114,7 @@ __kernel void m06900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m06900_a3.cl b/OpenCL/m06900_a3.cl index 4bc15f0ec..615eaff8b 100644 --- a/OpenCL/m06900_a3.cl +++ b/OpenCL/m06900_a3.cl @@ -5,6 +5,8 @@ #define _GOST_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 c_tables[4][256] = { @@ -288,11 +288,19 @@ __constant u32 c_tables[4][256] = } }; +#if VECT_SIZE == 1 #define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#endif #define round(k1,k2,tbl) \ { \ - u32 t; \ + u32x t; \ t = (k1) + r; \ l ^= BOX (((t >> 0) & 0xff), 0, tbl) ^ \ BOX (((t >> 8) & 0xff), 1, tbl) ^ \ @@ -307,8 +315,8 @@ __constant u32 c_tables[4][256] = #define R(k,h,s,i,t) \ { \ - u32 r; \ - u32 l; \ + u32x r; \ + u32x l; \ r = h[i + 0]; \ l = h[i + 1]; \ round (k[0], k[1], t); \ @@ -377,8 +385,8 @@ __constant u32 c_tables[4][256] = #define A(x) \ { \ - u32 l; \ - u32 r; \ + u32x l; \ + u32x r; \ l = x[0] ^ x[2]; \ r = x[1] ^ x[3]; \ x[0] = x[2]; \ @@ -393,8 +401,8 @@ __constant u32 c_tables[4][256] = #define AA(x) \ { \ - u32 l; \ - u32 r; \ + u32x l; \ + u32x r; \ l = x[0]; \ r = x[2]; \ x[0] = x[4]; \ @@ -652,8 +660,8 @@ __constant u32 c_tables[4][256] = #define PASS0(h,s,u,v,t) \ { \ - u32 k[8]; \ - u32 w[8]; \ + u32x k[8]; \ + u32x w[8]; \ X (w, u, v); \ P (k, w); \ R (k, h, s, 0, t); \ @@ -663,8 +671,8 @@ __constant u32 c_tables[4][256] = #define PASS2(h,s,u,v,t) \ { \ - u32 k[8]; \ - u32 w[8]; \ + u32x k[8]; \ + u32x w[8]; \ X (w, u, v); \ P (k, w); \ R (k, h, s, 2, t); \ @@ -675,8 +683,8 @@ __constant u32 c_tables[4][256] = #define PASS4(h,s,u,v,t) \ { \ - u32 k[8]; \ - u32 w[8]; \ + u32x k[8]; \ + u32x w[8]; \ X (w, u, v); \ P (k, w); \ R (k, h, s, 4, t); \ @@ -686,8 +694,8 @@ __constant u32 c_tables[4][256] = #define PASS6(h,s,u,v,t) \ { \ - u32 k[8]; \ - u32 w[8]; \ + u32x k[8]; \ + u32x w[8]; \ X (w, u, v); \ P (k, w); \ R (k, h, s, 6, t); \ @@ -714,15 +722,15 @@ static void m06900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 data[8]; + u32x data[8]; - data[0] = w0[0]; + data[0] = w0lr; data[1] = w0[1]; data[2] = w0[2]; data[3] = w0[3]; @@ -731,7 +739,7 @@ static void m06900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le data[6] = w1[2]; data[7] = w1[3]; - u32 state[16]; + u32x state[16]; state[ 0] = 0; state[ 1] = 0; @@ -750,8 +758,8 @@ static void m06900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le state[14] = data[6]; state[15] = data[7]; - u32 state_m[8]; - u32 data_m[8]; + u32x state_m[8]; + u32x data_m[8]; /* gost1 */ @@ -773,7 +781,7 @@ static void m06900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le data_m[6] = data[6]; data_m[7] = data[7]; - u32 tmp[8]; + u32x tmp[8]; if (pw_len > 0) { @@ -865,14 +873,10 @@ static void m06900m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le /* store */ - const u32 r0 = state[0]; - const u32 r1 = state[1]; - const u32 r2 = state[2]; - const u32 r3 = state[3]; - - #include COMPARE_M + COMPARE_M_SIMD (state[0], state[1], state[2], state[3]); } } + static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 s_tables[4][256]) { /** @@ -906,15 +910,15 @@ static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 data[8]; + u32x data[8]; - data[0] = w0[0]; + data[0] = w0lr; data[1] = w0[1]; data[2] = w0[2]; data[3] = w0[3]; @@ -923,7 +927,7 @@ static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le data[6] = w1[2]; data[7] = w1[3]; - u32 state[16]; + u32x state[16]; state[ 0] = 0; state[ 1] = 0; @@ -942,8 +946,8 @@ static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le state[14] = data[6]; state[15] = data[7]; - u32 state_m[8]; - u32 data_m[8]; + u32x state_m[8]; + u32x data_m[8]; /* gost1 */ @@ -965,7 +969,7 @@ static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le data_m[6] = data[6]; data_m[7] = data[7]; - u32 tmp[8]; + u32x tmp[8]; if (pw_len > 0) { @@ -1057,12 +1061,7 @@ static void m06900s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le /* store */ - const u32 r0 = state[0]; - const u32 r1 = state[1]; - const u32 r2 = state[2]; - const u32 r3 = state[3]; - - #include COMPARE_S + COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } } diff --git a/OpenCL/m07300_a1.cl b/OpenCL/m07300_a1.cl index 2e6ab9641..5032ac58c 100644 --- a/OpenCL/m07300_a1.cl +++ b/OpenCL/m07300_a1.cl @@ -285,7 +285,7 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -334,7 +334,7 @@ __kernel void m07300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -515,7 +515,7 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -576,7 +576,7 @@ __kernel void m07300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m07300_a3.cl b/OpenCL/m07300_a3.cl index ca53d5393..bea9dbe09 100644 --- a/OpenCL/m07300_a3.cl +++ b/OpenCL/m07300_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,34 +18,32 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -148,7 +148,7 @@ static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], c digest[4] += E; } -static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5]) +static void hmac_sha1_pad (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5]) { w0[0] = w0[0] ^ 0x36363636; w0[1] = w0[1] ^ 0x36363636; @@ -201,7 +201,7 @@ static void hmac_sha1_pad (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[ sha1_transform (w0, w1, w2, w3, opad); } -static void hmac_sha1_run (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 ipad[5], u32 opad[5], u32 digest[5]) +static void hmac_sha1_run (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x ipad[5], u32x opad[5], u32x digest[5]) { digest[0] = ipad[0]; digest[1] = ipad[1]; @@ -254,46 +254,46 @@ static void m07300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -341,16 +341,11 @@ static void m07300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + esalt_size) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); } } @@ -383,46 +378,46 @@ static void m07300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * pads */ - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = 0; - u32 ipad[5]; - u32 opad[5]; + u32x ipad[5]; + u32x opad[5]; hmac_sha1_pad (w0_t, w1_t, w2_t, w3_t, ipad, opad); @@ -470,16 +465,11 @@ static void m07300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = (64 + esalt_size) * 8; - u32 digest[5]; + u32x digest[5]; hmac_sha1_run (w0_t, w1_t, w2_t, w3_t, ipad, opad, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); } } diff --git a/OpenCL/m07500_a1.cl b/OpenCL/m07500_a1.cl index 648e3be35..b9b74c3c2 100644 --- a/OpenCL/m07500_a1.cl +++ b/OpenCL/m07500_a1.cl @@ -591,7 +591,7 @@ __kernel void m07500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -656,7 +656,7 @@ __kernel void m07500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -768,7 +768,7 @@ __kernel void m07500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -833,7 +833,7 @@ __kernel void m07500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m07600_a1.cl b/OpenCL/m07600_a1.cl index e86a2e112..62db3eeed 100644 --- a/OpenCL/m07600_a1.cl +++ b/OpenCL/m07600_a1.cl @@ -89,7 +89,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -170,7 +170,7 @@ __kernel void m07600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -722,7 +722,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -821,7 +821,7 @@ __kernel void m07600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m07600_a3.cl b/OpenCL/m07600_a3.cl index 3b9846eae..b145718bd 100644 --- a/OpenCL/m07600_a3.cl +++ b/OpenCL/m07600_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -71,38 +79,38 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -210,7 +218,7 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * Prepend salt */ - u32 w0t[4]; + u32x w0t[4]; w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 | uint_to_hex_lower8 ((a >> 16) & 255) << 16; @@ -221,7 +229,7 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 | uint_to_hex_lower8 ((b >> 0) & 255) << 16; - u32 w1t[4]; + u32x w1t[4]; w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 | uint_to_hex_lower8 ((c >> 16) & 255) << 16; @@ -232,7 +240,7 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 | uint_to_hex_lower8 ((d >> 0) & 255) << 16; - u32 w2t[2]; + u32x w2t[2]; w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 | uint_to_hex_lower8 ((e >> 16) & 255) << 16; @@ -387,11 +395,11 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d += SHA1M_D; e += SHA1M_E; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; - u32 r_e = e; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; + u32x r_e = e; // 2nd transform @@ -514,12 +522,7 @@ static void m07600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d += r_d; e += r_e; - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -584,38 +587,38 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -723,7 +726,7 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * Prepend salt */ - u32 w0t[4]; + u32x w0t[4]; w0t[0] = uint_to_hex_lower8 ((a >> 24) & 255) << 0 | uint_to_hex_lower8 ((a >> 16) & 255) << 16; @@ -734,7 +737,7 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w0t[3] = uint_to_hex_lower8 ((b >> 8) & 255) << 0 | uint_to_hex_lower8 ((b >> 0) & 255) << 16; - u32 w1t[4]; + u32x w1t[4]; w1t[0] = uint_to_hex_lower8 ((c >> 24) & 255) << 0 | uint_to_hex_lower8 ((c >> 16) & 255) << 16; @@ -745,7 +748,7 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w1t[3] = uint_to_hex_lower8 ((d >> 8) & 255) << 0 | uint_to_hex_lower8 ((d >> 0) & 255) << 16; - u32 w2t[2]; + u32x w2t[2]; w2t[0] = uint_to_hex_lower8 ((e >> 24) & 255) << 0 | uint_to_hex_lower8 ((e >> 16) & 255) << 16; @@ -900,11 +903,11 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d += SHA1M_D; e += SHA1M_E; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; - u32 r_e = e; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; + u32x r_e = e; // 2nd transform @@ -1027,12 +1030,7 @@ static void m07600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d += r_d; e += r_e; - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } diff --git a/OpenCL/m07700_a0.cl b/OpenCL/m07700_a0.cl index 49a50cf30..149bbcdd5 100644 --- a/OpenCL/m07700_a0.cl +++ b/OpenCL/m07700_a0.cl @@ -340,7 +340,7 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; @@ -685,7 +685,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; diff --git a/OpenCL/m07700_a1.cl b/OpenCL/m07700_a1.cl index 9a8b2e5e3..7d5fe0594 100644 --- a/OpenCL/m07700_a1.cl +++ b/OpenCL/m07700_a1.cl @@ -264,7 +264,7 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -328,7 +328,7 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -370,7 +370,7 @@ __kernel void m07700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -629,7 +629,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -701,7 +701,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -743,7 +743,7 @@ __kernel void m07700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m07700_a3.cl b/OpenCL/m07700_a3.cl index f367473cb..0bc3b13f9 100644 --- a/OpenCL/m07700_a3.cl +++ b/OpenCL/m07700_a3.cl @@ -272,7 +272,7 @@ static void m07700m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -543,7 +543,7 @@ static void m07700s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m07800_a0.cl b/OpenCL/m07800_a0.cl index 07138d647..451af6463 100644 --- a/OpenCL/m07800_a0.cl +++ b/OpenCL/m07800_a0.cl @@ -309,7 +309,7 @@ __kernel void m07800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; @@ -589,7 +589,7 @@ __kernel void m07800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, out_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, out_len); const u32 pw_salt_len = out_len + salt_len; diff --git a/OpenCL/m07800_a1.cl b/OpenCL/m07800_a1.cl index 679b255ad..b7aaa744c 100644 --- a/OpenCL/m07800_a1.cl +++ b/OpenCL/m07800_a1.cl @@ -238,7 +238,7 @@ __kernel void m07800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -298,7 +298,7 @@ __kernel void m07800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -333,7 +333,7 @@ __kernel void m07800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -560,7 +560,7 @@ __kernel void m07800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } const u32 search[4] = @@ -628,7 +628,7 @@ __kernel void m07800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } /** @@ -663,7 +663,7 @@ __kernel void m07800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m07800_a3.cl b/OpenCL/m07800_a3.cl index 20d5fd409..ad5144a61 100644 --- a/OpenCL/m07800_a3.cl +++ b/OpenCL/m07800_a3.cl @@ -254,7 +254,7 @@ static void m07800m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -473,7 +473,7 @@ static void m07800s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, pw_len); + switch_buffer_by_offset_le (s0, s1, s2, s3, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m08000_a1.cl b/OpenCL/m08000_a1.cl index 536b60e07..aad202ddb 100644 --- a/OpenCL/m08000_a1.cl +++ b/OpenCL/m08000_a1.cl @@ -319,7 +319,7 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -352,7 +352,7 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -555,7 +555,7 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -600,7 +600,7 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m08000_a3.cl b/OpenCL/m08000_a3.cl index a5eb3fc75..8fc05730a 100644 --- a/OpenCL/m08000_a3.cl +++ b/OpenCL/m08000_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 k_sha256[64] = { @@ -40,33 +40,38 @@ __constant u32 k_sha256[64] = SHA256C3c, SHA256C3d, SHA256C3e, SHA256C3f, }; -static void sha256_transform (u32 digest[8], const u32 w[16]) -{ - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; +#define SHA256_S0_S(x) (rotl32_S ((x), 25u) ^ rotl32_S ((x), 14u) ^ SHIFT_RIGHT_32 ((x), 3u)) +#define SHA256_S1_S(x) (rotl32_S ((x), 15u) ^ rotl32_S ((x), 13u) ^ SHIFT_RIGHT_32 ((x), 10u)) - u32 w0_t = w[ 0]; - u32 w1_t = w[ 1]; - u32 w2_t = w[ 2]; - u32 w3_t = w[ 3]; - u32 w4_t = w[ 4]; - u32 w5_t = w[ 5]; - u32 w6_t = w[ 6]; - u32 w7_t = w[ 7]; - u32 w8_t = w[ 8]; - u32 w9_t = w[ 9]; - u32 wa_t = w[10]; - u32 wb_t = w[11]; - u32 wc_t = w[12]; - u32 wd_t = w[13]; - u32 we_t = w[14]; - u32 wf_t = w[15]; +#define SHA256_EXPAND_S(x,y,z,w) (SHA256_S1_S (x) + y + SHA256_S0_S (z) + w) + +static void sha256_transform (u32x digest[8], const u32x w[16]) +{ + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; + u32x e = digest[4]; + u32x f = digest[5]; + u32x g = digest[6]; + u32x h = digest[7]; + + u32x w0_t = w[ 0]; + u32x w1_t = w[ 1]; + u32x w2_t = w[ 2]; + u32x w3_t = w[ 3]; + u32x w4_t = w[ 4]; + u32x w5_t = w[ 5]; + u32x w6_t = w[ 6]; + u32x w7_t = w[ 7]; + u32x w8_t = w[ 8]; + u32x w9_t = w[ 9]; + u32x wa_t = w[10]; + u32x wb_t = w[11]; + u32x wc_t = w[12]; + u32x wd_t = w[13]; + u32x we_t = w[14]; + u32x wf_t = w[15]; #define ROUND_EXPAND() \ { \ @@ -126,16 +131,16 @@ static void sha256_transform (u32 digest[8], const u32 w[16]) digest[7] += h; } -static void sha256_transform_z (u32 digest[8]) +static void sha256_transform_z (u32x digest[8]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; + u32x e = digest[4]; + u32x f = digest[5]; + u32x g = digest[6]; + u32x h = digest[7]; #define ROUND_STEP_Z(i) \ { \ @@ -175,16 +180,16 @@ static void sha256_transform_z (u32 digest[8]) digest[7] += h; } -static void sha256_transform_s (u32 digest[8], __local u32 w[64]) +static void sha256_transform_s (u32x digest[8], __local u32 w[64]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; - u32 e = digest[4]; - u32 f = digest[5]; - u32 g = digest[6]; - u32 h = digest[7]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; + u32x e = digest[4]; + u32x f = digest[5]; + u32x g = digest[6]; + u32x h = digest[7]; #define ROUND_STEP_S(i) \ { \ @@ -224,7 +229,7 @@ static void sha256_transform_s (u32 digest[8], __local u32 w[64]) digest[7] += h; } -static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max) +static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max) { /** * modifier @@ -238,9 +243,9 @@ static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons * salt */ - const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 /** * precompute final msg blocks @@ -261,7 +266,7 @@ static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons #pragma unroll for (int i = 16; i < 64; i++) { - w_s1[i] = SHA256_EXPAND (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]); + w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]); } w_s2[ 0] = salt_buf0 << 16 | salt_buf1 >> 16; @@ -272,7 +277,7 @@ static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons #pragma unroll for (int i = 16; i < 64; i++) { - w_s2[i] = SHA256_EXPAND (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]); + w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]); } } @@ -284,21 +289,24 @@ static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons * modifier */ - w[ 1] = w[ 1] >> 8; - w[ 2] = w[ 2] >> 8; - w[ 3] = w[ 3] >> 8; - w[ 4] = w[ 4] >> 8; - w[ 5] = w[ 5] >> 8; - w[ 6] = w[ 6] >> 8; - w[ 7] = w[ 7] >> 8; - w[ 8] = w[ 8] >> 8; - w[ 9] = w[ 9] >> 8; - w[10] = w[10] >> 8; - w[11] = w[11] >> 8; - w[12] = w[12] >> 8; - w[13] = w[13] >> 8; - w[14] = w[14] >> 8; - w[15] = w[15] >> 8; + u32x w_t[16]; + + w_t[ 0] = w[ 0] >> 8; + w_t[ 1] = w[ 1] >> 8; + w_t[ 2] = w[ 2] >> 8; + w_t[ 3] = w[ 3] >> 8; + w_t[ 4] = w[ 4] >> 8; + w_t[ 5] = w[ 5] >> 8; + w_t[ 6] = w[ 6] >> 8; + w_t[ 7] = w[ 7] >> 8; + w_t[ 8] = w[ 8] >> 8; + w_t[ 9] = w[ 9] >> 8; + w_t[10] = w[10] >> 8; + w_t[11] = w[11] >> 8; + w_t[12] = w[12] >> 8; + w_t[13] = w[13] >> 8; + w_t[14] = w[14] >> 8; + w_t[15] = w[15] >> 8; /** * loop @@ -306,15 +314,15 @@ static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0lr = w0l | w0r; - w[0] = w0 >> 8; + w_t[0] = w0lr >> 8; - u32 digest[8]; + u32x digest[8]; digest[0] = SHA256M_A; digest[1] = SHA256M_B; @@ -325,7 +333,7 @@ static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons digest[6] = SHA256M_G; digest[7] = SHA256M_H; - sha256_transform (digest, w); // 0 - 64 + sha256_transform (digest, w_t); // 0 - 64 sha256_transform_z (digest); // 64 - 128 sha256_transform_z (digest); // 128 - 192 sha256_transform_z (digest); // 192 - 256 @@ -335,16 +343,11 @@ static void m08000m (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons sha256_transform_s (digest, w_s1); // 448 - 512 sha256_transform_s (digest, w_s2); // 512 - 576 - const u32 r0 = digest[3]; - const u32 r1 = digest[7]; - const u32 r2 = digest[2]; - const u32 r3 = digest[6]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[7], digest[2], digest[6]); } } -static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max) +static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 gid_max) { /** * modifier @@ -358,9 +361,9 @@ static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons * salt */ - const u32 salt_buf0 = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); - const u32 salt_buf1 = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); - const u32 salt_buf2 = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 + const u32 salt_buf0 = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]); + const u32 salt_buf1 = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]); + const u32 salt_buf2 = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]); // 0x80 /** * precompute final msg blocks @@ -381,7 +384,7 @@ static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons #pragma unroll for (int i = 16; i < 64; i++) { - w_s1[i] = SHA256_EXPAND (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]); + w_s1[i] = SHA256_EXPAND_S (w_s1[i - 2], w_s1[i - 7], w_s1[i - 15], w_s1[i - 16]); } w_s2[ 0] = salt_buf0 << 16 | salt_buf1 >> 16; @@ -392,7 +395,7 @@ static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons #pragma unroll for (int i = 16; i < 64; i++) { - w_s2[i] = SHA256_EXPAND (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]); + w_s2[i] = SHA256_EXPAND_S (w_s2[i - 2], w_s2[i - 7], w_s2[i - 15], w_s2[i - 16]); } } @@ -404,21 +407,24 @@ static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons * modifier */ - w[ 1] = w[ 1] >> 8; - w[ 2] = w[ 2] >> 8; - w[ 3] = w[ 3] >> 8; - w[ 4] = w[ 4] >> 8; - w[ 5] = w[ 5] >> 8; - w[ 6] = w[ 6] >> 8; - w[ 7] = w[ 7] >> 8; - w[ 8] = w[ 8] >> 8; - w[ 9] = w[ 9] >> 8; - w[10] = w[10] >> 8; - w[11] = w[11] >> 8; - w[12] = w[12] >> 8; - w[13] = w[13] >> 8; - w[14] = w[14] >> 8; - w[15] = w[15] >> 8; + u32x w_t[16]; + + w_t[ 0] = w[ 0] >> 8; + w_t[ 1] = w[ 1] >> 8; + w_t[ 2] = w[ 2] >> 8; + w_t[ 3] = w[ 3] >> 8; + w_t[ 4] = w[ 4] >> 8; + w_t[ 5] = w[ 5] >> 8; + w_t[ 6] = w[ 6] >> 8; + w_t[ 7] = w[ 7] >> 8; + w_t[ 8] = w[ 8] >> 8; + w_t[ 9] = w[ 9] >> 8; + w_t[10] = w[10] >> 8; + w_t[11] = w[11] >> 8; + w_t[12] = w[12] >> 8; + w_t[13] = w[13] >> 8; + w_t[14] = w[14] >> 8; + w_t[15] = w[15] >> 8; /** * digest @@ -438,15 +444,15 @@ static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0lr = w0l | w0r; - w[0] = w0 >> 8; + w_t[0] = w0lr >> 8; - u32 digest[8]; + u32x digest[8]; digest[0] = SHA256M_A; digest[1] = SHA256M_B; @@ -457,7 +463,7 @@ static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons digest[6] = SHA256M_G; digest[7] = SHA256M_H; - sha256_transform (digest, w); // 0 - 64 + sha256_transform (digest, w_t); // 0 - 64 sha256_transform_z (digest); // 64 - 128 sha256_transform_z (digest); // 128 - 192 sha256_transform_z (digest); // 192 - 256 @@ -467,16 +473,11 @@ static void m08000s (__local u32 w_s1[64], __local u32 w_s2[64], u32 w[16], cons sha256_transform_s (digest, w_s1); // 448 - 512 sha256_transform_s (digest, w_s2); // 512 - 576 - const u32 r0 = digest[3]; - const u32 r1 = digest[7]; - const u32 r2 = digest[2]; - const u32 r3 = digest[6]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[7], digest[2], digest[6]); } } -__kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -515,7 +516,7 @@ __kernel void m08000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08000m (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max); } -__kernel void m08000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -554,7 +555,7 @@ __kernel void m08000_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08000m (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max); } -__kernel void m08000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -593,7 +594,7 @@ __kernel void m08000_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08000m (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max); } -__kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -632,7 +633,7 @@ __kernel void m08000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08000s (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max); } -__kernel void m08000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -671,7 +672,7 @@ __kernel void m08000_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08000s (w_s1, w_s2, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset, gid_max); } -__kernel void m08000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08000_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m08100_a1.cl b/OpenCL/m08100_a1.cl index 75679b20d..325c08b28 100644 --- a/OpenCL/m08100_a1.cl +++ b/OpenCL/m08100_a1.cl @@ -68,7 +68,7 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -116,7 +116,7 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -359,7 +359,7 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -425,7 +425,7 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m08100_a3.cl b/OpenCL/m08100_a3.cl index adfb28c97..0dac6eab7 100644 --- a/OpenCL/m08100_a3.cl +++ b/OpenCL/m08100_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m08100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -48,24 +48,24 @@ static void m08100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * prepend salt */ - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; - w0_t[2] = w0[0]; + w0_t[2] = w0lr; w0_t[3] = w0[1]; w1_t[0] = w0[2]; w1_t[1] = w0[3]; @@ -84,11 +84,11 @@ static void m08100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -186,12 +186,7 @@ static void m08100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -220,7 +215,7 @@ static void m08100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * reverse */ - const u32 e_rev = rotl32 (search[1], 2u); + const u32 e_rev = rotl32_S (search[1], 2u); /** * salt @@ -241,24 +236,24 @@ static void m08100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * prepend salt */ - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; - w0_t[2] = w0[0]; + w0_t[2] = w0lr; w0_t[3] = w0[1]; w1_t[0] = w0[2]; w1_t[1] = w0[3]; @@ -277,11 +272,11 @@ static void m08100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -376,18 +371,13 @@ static void m08100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w2_t[3] = rotl32 ((w2_t[0] ^ w0_t[3] ^ w3_t[1] ^ w2_t[3]), 1u); SHA1_STEP (SHA1_F1, a, b, c, d, e, w2_t[3]); w3_t[0] = rotl32 ((w2_t[1] ^ w1_t[0] ^ w3_t[2] ^ w3_t[0]), 1u); SHA1_STEP (SHA1_F1, e, a, b, c, d, w3_t[0]); - if (allx (e != e_rev)) continue; + if (MATCHES_NONE_VS (e, e_rev)) continue; w3_t[1] = rotl32 ((w2_t[2] ^ w1_t[1] ^ w3_t[3] ^ w3_t[1]), 1u); SHA1_STEP (SHA1_F1, d, e, a, b, c, w3_t[1]); w3_t[2] = rotl32 ((w2_t[3] ^ w1_t[2] ^ w0_t[0] ^ w3_t[2]), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, w3_t[2]); w3_t[3] = rotl32 ((w3_t[0] ^ w1_t[3] ^ w0_t[1] ^ w3_t[3]), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, w3_t[3]); - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } @@ -435,18 +425,18 @@ __kernel void m08100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); - append_0x80_2x4 (w0, w1, pw_len + 1); + append_0x80_2x4_S (w0, w1, pw_len + 1); - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); /** * main @@ -499,26 +489,26 @@ __kernel void m08100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); - append_0x80_3x4 (w0, w1, w2, pw_len + 1); + append_0x80_3x4_S (w0, w1, w2, pw_len + 1); - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); /** * main @@ -571,39 +561,39 @@ __kernel void m08100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); - w2[1] = swap32 (w2[1]); - w2[2] = swap32 (w2[2]); - w2[3] = swap32 (w2[3]); - w3[0] = swap32 (w3[0]); - w3[1] = swap32 (w3[1]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); w3[2] = 0; w3[3] = 0; - append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1); + append_0x80_4x4_S (w0, w1, w2, w3, pw_len + 1); - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); - w2[1] = swap32 (w2[1]); - w2[2] = swap32 (w2[2]); - w2[3] = swap32 (w2[3]); - w3[0] = swap32 (w3[0]); - w3[1] = swap32 (w3[1]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); w3[2] = 0; w3[3] = 0; @@ -658,18 +648,18 @@ __kernel void m08100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); - append_0x80_2x4 (w0, w1, pw_len + 1); + append_0x80_2x4_S (w0, w1, pw_len + 1); - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); /** * main @@ -722,26 +712,26 @@ __kernel void m08100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); - append_0x80_3x4 (w0, w1, w2, pw_len + 1); + append_0x80_3x4_S (w0, w1, w2, pw_len + 1); - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); /** * main @@ -794,39 +784,39 @@ __kernel void m08100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * base */ - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); - w2[1] = swap32 (w2[1]); - w2[2] = swap32 (w2[2]); - w2[3] = swap32 (w2[3]); - w3[0] = swap32 (w3[0]); - w3[1] = swap32 (w3[1]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); w3[2] = 0; w3[3] = 0; - append_0x80_4x4 (w0, w1, w2, w3, pw_len + 1); + append_0x80_4x4_S (w0, w1, w2, w3, pw_len + 1); - w0[0] = swap32 (w0[0]); - w0[1] = swap32 (w0[1]); - w0[2] = swap32 (w0[2]); - w0[3] = swap32 (w0[3]); - w1[0] = swap32 (w1[0]); - w1[1] = swap32 (w1[1]); - w1[2] = swap32 (w1[2]); - w1[3] = swap32 (w1[3]); - w2[0] = swap32 (w2[0]); - w2[1] = swap32 (w2[1]); - w2[2] = swap32 (w2[2]); - w2[3] = swap32 (w2[3]); - w3[0] = swap32 (w3[0]); - w3[1] = swap32 (w3[1]); + w0[0] = swap32_S (w0[0]); + w0[1] = swap32_S (w0[1]); + w0[2] = swap32_S (w0[2]); + w0[3] = swap32_S (w0[3]); + w1[0] = swap32_S (w1[0]); + w1[1] = swap32_S (w1[1]); + w1[2] = swap32_S (w1[2]); + w1[3] = swap32_S (w1[3]); + w2[0] = swap32_S (w2[0]); + w2[1] = swap32_S (w2[1]); + w2[2] = swap32_S (w2[2]); + w2[3] = swap32_S (w2[3]); + w3[0] = swap32_S (w3[0]); + w3[1] = swap32_S (w3[1]); w3[2] = 0; w3[3] = 0; diff --git a/OpenCL/m08300_a0.cl b/OpenCL/m08300_a0.cl index ef248928b..2872e9f1f 100644 --- a/OpenCL/m08300_a0.cl +++ b/OpenCL/m08300_a0.cl @@ -278,7 +278,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); w0_t[0] |= pw_len & 0xff; @@ -314,7 +314,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + out_len + domain_len + 1); + switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + out_len + domain_len + 1); u32 d0[4]; @@ -344,7 +344,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d3[2] = 0; d3[3] = 0; - switch_buffer_by_offset (d0, d1, d2, d3, 1 + out_len); + switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + out_len); /** * sha1 @@ -586,7 +586,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); w0_t[0] |= pw_len & 0xff; @@ -622,7 +622,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + out_len + domain_len + 1); + switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + out_len + domain_len + 1); u32 d0[4]; @@ -652,7 +652,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d3[2] = 0; d3[3] = 0; - switch_buffer_by_offset (d0, d1, d2, d3, 1 + out_len); + switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + out_len); /** * sha1 diff --git a/OpenCL/m08300_a1.cl b/OpenCL/m08300_a1.cl index 49d7a16a0..74b7658bb 100644 --- a/OpenCL/m08300_a1.cl +++ b/OpenCL/m08300_a1.cl @@ -196,7 +196,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -271,7 +271,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -324,7 +324,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); w0_t[0] |= pw_len & 0xff; @@ -360,7 +360,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); + switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); u32 d0[4]; @@ -390,7 +390,7 @@ __kernel void m08300_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d3[2] = 0; d3[3] = 0; - switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len); + switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + pw_len); /** * sha1 @@ -540,7 +540,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -627,7 +627,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -680,7 +680,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); w0_t[0] |= pw_len & 0xff; @@ -716,7 +716,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); + switch_buffer_by_offset_le (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); u32 d0[4]; @@ -746,7 +746,7 @@ __kernel void m08300_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, d3[2] = 0; d3[3] = 0; - switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len); + switch_buffer_by_offset_le (d0, d1, d2, d3, 1 + pw_len); /** * sha1 diff --git a/OpenCL/m08300_a3.cl b/OpenCL/m08300_a3.cl index ec49b98c0..051c90576 100644 --- a/OpenCL/m08300_a3.cl +++ b/OpenCL/m08300_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,34 +18,32 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -227,7 +227,7 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); + switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); u32 d0[4]; @@ -257,7 +257,7 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d3[2] = 0; d3[3] = 0; - switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len); + switch_buffer_by_offset_le_S (d0, d1, d2, d3, 1 + pw_len); /** * loop @@ -265,41 +265,41 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); w0_t[0] |= pw_len & 0xff; @@ -307,35 +307,35 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 w0_t2[4]; + u32x w0_t2[4]; w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); - u32 w1_t2[4]; + u32x w1_t2[4]; w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); - u32 w2_t2[4]; + u32x w2_t2[4]; w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); - u32 w3_t2[4]; + u32x w3_t2[4]; w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -349,28 +349,28 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le for (u32 i = 0; i < salt_iter; i++) { - u32 w0_t3[4]; + u32x w0_t3[4]; w0_t3[0] = digest[0]; w0_t3[1] = digest[1]; w0_t3[2] = digest[2]; w0_t3[3] = digest[3]; - u32 w1_t3[4]; + u32x w1_t3[4]; w1_t3[0] = digest[4]; w1_t3[1] = swap32 (salt_buf0[0]); w1_t3[2] = swap32 (salt_buf0[1]); w1_t3[3] = swap32 (salt_buf0[2]); - u32 w2_t3[4]; + u32x w2_t3[4]; w2_t3[0] = swap32 (salt_buf0[3]); w2_t3[1] = swap32 (salt_buf1[0]); w2_t3[2] = swap32 (salt_buf1[1]); w2_t3[3] = swap32 (salt_buf1[2]); - u32 w3_t3[4]; + u32x w3_t3[4]; w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; @@ -386,12 +386,7 @@ static void m08300m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest); } - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); } } @@ -486,7 +481,7 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le s3[2] = 0; s3[3] = 0; - switch_buffer_by_offset (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); + switch_buffer_by_offset_le_S (s0, s1, s2, s3, 1 + pw_len + domain_len + 1); u32 d0[4]; @@ -516,7 +511,7 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d3[2] = 0; d3[3] = 0; - switch_buffer_by_offset (d0, d1, d2, d3, 1 + pw_len); + switch_buffer_by_offset_le_S (d0, d1, d2, d3, 1 + pw_len); /** * loop @@ -524,41 +519,41 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = w3[2]; w3_t[3] = w3[3]; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, 1); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, 1); w0_t[0] |= pw_len & 0xff; @@ -566,35 +561,35 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * sha1 */ - u32 w0_t2[4]; + u32x w0_t2[4]; w0_t2[0] = swap32 (w0_t[0] | d0[0] | s0[0]); w0_t2[1] = swap32 (w0_t[1] | d0[1] | s0[1]); w0_t2[2] = swap32 (w0_t[2] | d0[2] | s0[2]); w0_t2[3] = swap32 (w0_t[3] | d0[3] | s0[3]); - u32 w1_t2[4]; + u32x w1_t2[4]; w1_t2[0] = swap32 (w1_t[0] | d1[0] | s1[0]); w1_t2[1] = swap32 (w1_t[1] | d1[1] | s1[1]); w1_t2[2] = swap32 (w1_t[2] | d1[2] | s1[2]); w1_t2[3] = swap32 (w1_t[3] | d1[3] | s1[3]); - u32 w2_t2[4]; + u32x w2_t2[4]; w2_t2[0] = swap32 (w2_t[0] | d2[0] | s2[0]); w2_t2[1] = swap32 (w2_t[1] | d2[1] | s2[1]); w2_t2[2] = swap32 (w2_t[2] | d2[2] | s2[2]); w2_t2[3] = swap32 (w2_t[3] | d2[3] | s2[3]); - u32 w3_t2[4]; + u32x w3_t2[4]; w3_t2[0] = swap32 (w3_t[0] | d3[0] | s3[0]); w3_t2[1] = swap32 (w3_t[1] | d3[1] | s3[1]); w3_t2[2] = 0; w3_t2[3] = (1 + pw_len + domain_len + 1 + salt_len) * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -608,28 +603,28 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le for (u32 i = 0; i < salt_iter; i++) { - u32 w0_t3[4]; + u32x w0_t3[4]; w0_t3[0] = digest[0]; w0_t3[1] = digest[1]; w0_t3[2] = digest[2]; w0_t3[3] = digest[3]; - u32 w1_t3[4]; + u32x w1_t3[4]; w1_t3[0] = digest[4]; w1_t3[1] = swap32 (salt_buf0[0]); w1_t3[2] = swap32 (salt_buf0[1]); w1_t3[3] = swap32 (salt_buf0[2]); - u32 w2_t3[4]; + u32x w2_t3[4]; w2_t3[0] = swap32 (salt_buf0[3]); w2_t3[1] = swap32 (salt_buf1[0]); w2_t3[2] = swap32 (salt_buf1[1]); w2_t3[3] = swap32 (salt_buf1[2]); - u32 w3_t3[4]; + u32x w3_t3[4]; w3_t3[0] = swap32 (salt_buf1[3]); w3_t3[1] = 0; @@ -645,12 +640,7 @@ static void m08300s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t3, w1_t3, w2_t3, w3_t3, digest); } - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); } } diff --git a/OpenCL/m08400_a1.cl b/OpenCL/m08400_a1.cl index 079189dba..be3048bcc 100644 --- a/OpenCL/m08400_a1.cl +++ b/OpenCL/m08400_a1.cl @@ -217,7 +217,7 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -289,7 +289,7 @@ __kernel void m08400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -568,7 +568,7 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -652,7 +652,7 @@ __kernel void m08400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m08400_a3.cl b/OpenCL/m08400_a3.cl index 4bbd358a5..181f0152e 100644 --- a/OpenCL/m08400_a3.cl +++ b/OpenCL/m08400_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,36 +18,42 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#if VECT_SIZE == 1 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8_le(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif -#define uint_to_hex_lower8_le(i) l_bin2asc[(i)] - -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -165,22 +173,22 @@ static void m08400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf0[4]; - salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]); + salt_buf0[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; @@ -192,41 +200,41 @@ static void m08400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = pw_len * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -236,11 +244,11 @@ static void m08400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a; - u32 b; - u32 c; - u32 d; - u32 e; + u32x a; + u32x b; + u32x c; + u32x d; + u32x e; a = digest[0]; b = digest[1]; @@ -362,12 +370,7 @@ static void m08400m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_M + COMPARE_M_SIMD (digest[3], digest[4], digest[2], digest[1]); } } @@ -398,22 +401,22 @@ static void m08400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf0[4]; - salt_buf0[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 0]); - salt_buf0[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 1]); - salt_buf0[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 2]); - salt_buf0[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 3]); + salt_buf0[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 0]); + salt_buf0[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 1]); + salt_buf0[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 2]); + salt_buf0[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 3]); u32 salt_buf1[4]; - salt_buf1[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 4]); - salt_buf1[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 5]); - salt_buf1[2] = swap32 (salt_bufs[salt_pos].salt_buf[ 6]); - salt_buf1[3] = swap32 (salt_bufs[salt_pos].salt_buf[ 7]); + salt_buf1[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 4]); + salt_buf1[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 5]); + salt_buf1[2] = swap32_S (salt_bufs[salt_pos].salt_buf[ 6]); + salt_buf1[3] = swap32_S (salt_bufs[salt_pos].salt_buf[ 7]); u32 salt_buf2[4]; - salt_buf2[0] = swap32 (salt_bufs[salt_pos].salt_buf[ 8]); - salt_buf2[1] = swap32 (salt_bufs[salt_pos].salt_buf[ 9]); + salt_buf2[0] = swap32_S (salt_bufs[salt_pos].salt_buf[ 8]); + salt_buf2[1] = swap32_S (salt_bufs[salt_pos].salt_buf[ 9]); salt_buf2[2] = 0; salt_buf2[3] = 0; @@ -425,41 +428,41 @@ static void m08400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0]; w1_t[1] = w1[1]; w1_t[2] = w1[2]; w1_t[3] = w1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0]; w2_t[1] = w2[1]; w2_t[2] = w2[2]; w2_t[3] = w2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0]; w3_t[1] = w3[1]; w3_t[2] = 0; w3_t[3] = pw_len * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -469,11 +472,11 @@ static void m08400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a; - u32 b; - u32 c; - u32 d; - u32 e; + u32x a; + u32x b; + u32x c; + u32x d; + u32x e; a = digest[0]; b = digest[1]; @@ -595,12 +598,7 @@ static void m08400s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - const u32 r0 = digest[3]; - const u32 r1 = digest[4]; - const u32 r2 = digest[2]; - const u32 r3 = digest[1]; - - #include COMPARE_S + COMPARE_S_SIMD (digest[3], digest[4], digest[2], digest[1]); } } diff --git a/OpenCL/m08500_a1.cl b/OpenCL/m08500_a1.cl index b773ba136..dfd09924f 100644 --- a/OpenCL/m08500_a1.cl +++ b/OpenCL/m08500_a1.cl @@ -581,7 +581,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -635,7 +635,7 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -782,7 +782,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -848,7 +848,7 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m08500_a3.cl b/OpenCL/m08500_a3.cl index c7ca8f58d..1fdf52dab 100644 --- a/OpenCL/m08500_a3.cl +++ b/OpenCL/m08500_a3.cl @@ -5,6 +5,8 @@ #define _DES_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define PERM_OP(a,b,tt,n,m) \ { \ @@ -376,20 +376,36 @@ __constant u32 c_skb[8][64] = } }; +#if VECT_SIZE == 1 #define BOX(i,n,S) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(i,n,S) (u32x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#endif -static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], __local u32 s_SPtrans[8][64]) +#if VECT_SIZE == 1 +#define BOX1(i,S) (S)[(i)] +#elif VECT_SIZE == 2 +#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1]) +#elif VECT_SIZE == 4 +#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) +#elif VECT_SIZE == 8 +#define BOX1(i,S) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#endif + +static void _des_crypt_encrypt (u32x iv[2], u32x data[2], u32x Kc[16], u32x Kd[16], __local u32 s_SPtrans[8][64]) { - u32 tt; - - u32 r = data[0]; - u32 l = data[1]; + u32x r = data[0]; + u32x l = data[1]; #pragma unroll 16 for (u32 i = 0; i < 16; i += 2) { - u32 u; - u32 t; + u32x u; + u32x t; u = Kc[i + 0] ^ r; t = Kd[i + 0] ^ rotl32 (r, 28u); @@ -420,9 +436,9 @@ static void _des_crypt_encrypt (u32 iv[2], u32 data[2], u32 Kc[16], u32 Kd[16], iv[1] = r; } -static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u32 s_skb[8][64]) +static void _des_crypt_keysetup (u32x c, u32x d, u32x Kc[16], u32x Kd[16], __local u32 s_skb[8][64]) { - u32 tt; + u32x tt; PERM_OP (d, c, tt, 4, 0x0f0f0f0f); HPERM_OP (c, tt, 2, 0xcccc0000); @@ -455,32 +471,32 @@ static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u c = c & 0x0fffffff; d = d & 0x0fffffff; - const u32 c00 = (c >> 0) & 0x0000003f; - const u32 c06 = (c >> 6) & 0x00383003; - const u32 c07 = (c >> 7) & 0x0000003c; - const u32 c13 = (c >> 13) & 0x0000060f; - const u32 c20 = (c >> 20) & 0x00000001; + const u32x c00 = (c >> 0) & 0x0000003f; + const u32x c06 = (c >> 6) & 0x00383003; + const u32x c07 = (c >> 7) & 0x0000003c; + const u32x c13 = (c >> 13) & 0x0000060f; + const u32x c20 = (c >> 20) & 0x00000001; - u32 s = BOX (((c00 >> 0) & 0xff), 0, s_skb) - | BOX (((c06 >> 0) & 0xff) - |((c07 >> 0) & 0xff), 1, s_skb) - | BOX (((c13 >> 0) & 0xff) - |((c06 >> 8) & 0xff), 2, s_skb) - | BOX (((c20 >> 0) & 0xff) - |((c13 >> 8) & 0xff) - |((c06 >> 16) & 0xff), 3, s_skb); + u32x s = BOX (((c00 >> 0) & 0xff), 0, s_skb) + | BOX (((c06 >> 0) & 0xff) + |((c07 >> 0) & 0xff), 1, s_skb) + | BOX (((c13 >> 0) & 0xff) + |((c06 >> 8) & 0xff), 2, s_skb) + | BOX (((c20 >> 0) & 0xff) + |((c13 >> 8) & 0xff) + |((c06 >> 16) & 0xff), 3, s_skb); - const u32 d00 = (d >> 0) & 0x00003c3f; - const u32 d07 = (d >> 7) & 0x00003f03; - const u32 d21 = (d >> 21) & 0x0000000f; - const u32 d22 = (d >> 22) & 0x00000030; + const u32x d00 = (d >> 0) & 0x00003c3f; + const u32x d07 = (d >> 7) & 0x00003f03; + const u32x d21 = (d >> 21) & 0x0000000f; + const u32x d22 = (d >> 22) & 0x00000030; - u32 t = BOX (((d00 >> 0) & 0xff), 4, s_skb) - | BOX (((d07 >> 0) & 0xff) - |((d00 >> 8) & 0xff), 5, s_skb) - | BOX (((d07 >> 8) & 0xff), 6, s_skb) - | BOX (((d21 >> 0) & 0xff) - |((d22 >> 0) & 0xff), 7, s_skb); + u32x t = BOX (((d00 >> 0) & 0xff), 4, s_skb) + | BOX (((d07 >> 0) & 0xff) + |((d00 >> 8) & 0xff), 5, s_skb) + | BOX (((d07 >> 8) & 0xff), 6, s_skb) + | BOX (((d21 >> 0) & 0xff) + |((d22 >> 0) & 0xff), 7, s_skb); Kc[i] = ((t << 16) | (s & 0x0000ffff)); Kd[i] = ((s >> 16) | (t & 0xffff0000)); @@ -490,20 +506,20 @@ static void _des_crypt_keysetup (u32 c, u32 d, u32 Kc[16], u32 Kd[16], __local u } } -static void transform_racf_key (const u32 w0, const u32 w1, u32 key[2]) +static void transform_racf_key (const u32x w0, const u32x w1, u32x key[2]) { - key[0] = (ascii_to_ebcdic_pc[(w0 >> 0) & 0xff]) << 0 - | (ascii_to_ebcdic_pc[(w0 >> 8) & 0xff]) << 8 - | (ascii_to_ebcdic_pc[(w0 >> 16) & 0xff]) << 16 - | (ascii_to_ebcdic_pc[(w0 >> 24) & 0xff]) << 24; + key[0] = BOX1 (((w0 >> 0) & 0xff), ascii_to_ebcdic_pc) << 0 + | BOX1 (((w0 >> 8) & 0xff), ascii_to_ebcdic_pc) << 8 + | BOX1 (((w0 >> 16) & 0xff), ascii_to_ebcdic_pc) << 16 + | BOX1 (((w0 >> 24) & 0xff), ascii_to_ebcdic_pc) << 24; - key[1] = (ascii_to_ebcdic_pc[(w1 >> 0) & 0xff]) << 0 - | (ascii_to_ebcdic_pc[(w1 >> 8) & 0xff]) << 8 - | (ascii_to_ebcdic_pc[(w1 >> 16) & 0xff]) << 16 - | (ascii_to_ebcdic_pc[(w1 >> 24) & 0xff]) << 24; + key[1] = BOX1 (((w1 >> 0) & 0xff), ascii_to_ebcdic_pc) << 0 + | BOX1 (((w1 >> 8) & 0xff), ascii_to_ebcdic_pc) << 8 + | BOX1 (((w1 >> 16) & 0xff), ascii_to_ebcdic_pc) << 16 + | BOX1 (((w1 >> 24) & 0xff), ascii_to_ebcdic_pc) << 24; } -static void m08500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m08500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -529,43 +545,41 @@ static void m08500m (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 u32 w1 = w[1]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 key[2]; + u32x key[2]; transform_racf_key (w0, w1, key); - const u32 c = key[0]; - const u32 d = key[1]; + const u32x c = key[0]; + const u32x d = key[1]; - u32 Kc[16]; - u32 Kd[16]; + u32x Kc[16]; + u32x Kd[16]; _des_crypt_keysetup (c, d, Kc, Kd, s_skb); - u32 data[2]; + u32x data[2]; data[0] = salt_buf0[0]; data[1] = salt_buf0[1]; - u32 iv[2]; + u32x iv[2]; _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); - const u32 r0 = iv[0]; - const u32 r1 = iv[1]; - const u32 r2 = 0; - const u32 r3 = 0; + u32x iv2 = 0; + u32x iv3 = 0; - #include COMPARE_M + COMPARE_M_SIMD (iv[0], iv[1], iv2, iv3); } } -static void m08500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m08500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -603,43 +617,41 @@ static void m08500s (__local u32 s_SPtrans[8][64], __local u32 s_skb[8][64], u32 u32 w1 = w[1]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 key[2]; + u32x key[2]; transform_racf_key (w0, w1, key); - const u32 c = key[0]; - const u32 d = key[1]; + const u32x c = key[0]; + const u32x d = key[1]; - u32 Kc[16]; - u32 Kd[16]; + u32x Kc[16]; + u32x Kd[16]; _des_crypt_keysetup (c, d, Kc, Kd, s_skb); - u32 data[2]; + u32x data[2]; data[0] = salt_buf0[0]; data[1] = salt_buf0[1]; - u32 iv[2]; + u32x iv[2]; _des_crypt_encrypt (iv, data, Kc, Kd, s_SPtrans); - const u32 r0 = iv[0]; - const u32 r1 = iv[1]; - const u32 r2 = 0; - const u32 r3 = 0; + u32x iv2 = 0; + u32x iv3 = 0; - #include COMPARE_S + COMPARE_S_SIMD (iv[0], iv[1], iv2, iv3); } } -__kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -713,15 +725,15 @@ __kernel void m08500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08500m (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m08500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -795,10 +807,10 @@ __kernel void m08500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08500s (s_SPtrans, s_skb, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } -__kernel void m08500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { } diff --git a/OpenCL/m08600_a1.cl b/OpenCL/m08600_a1.cl index 3030a58c6..83956c399 100644 --- a/OpenCL/m08600_a1.cl +++ b/OpenCL/m08600_a1.cl @@ -291,7 +291,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -334,7 +334,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; @@ -464,7 +464,7 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -519,7 +519,7 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; diff --git a/OpenCL/m08600_a3.cl b/OpenCL/m08600_a3.cl index f8547b274..908801697 100644 --- a/OpenCL/m08600_a3.cl +++ b/OpenCL/m08600_a3.cl @@ -5,6 +5,8 @@ #define _LOTUS5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 lotus_magic_table[256] = { @@ -56,11 +56,19 @@ __constant u32 lotus_magic_table[256] = 0x29, 0x39, 0xb9, 0xe9, 0x4c, 0xff, 0x43, 0xab, }; -#define BOX(S,i) (S)[(i)] +#if VECT_SIZE == 1 +#define BOX1(S,i) (S)[(i)] +#elif VECT_SIZE == 2 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1]) +#elif VECT_SIZE == 4 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) +#elif VECT_SIZE == 8 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#endif -static void lotus_mix (u32 *in, __local u32 s_lotus_magic_table[256]) +static void lotus_mix (u32x *in, __local u32 s_lotus_magic_table[256]) { - u32 p = 0; + u32x p = 0; for (int i = 0; i < 18; i++) { @@ -69,32 +77,32 @@ static void lotus_mix (u32 *in, __local u32 s_lotus_magic_table[256]) #pragma unroll 12 for (int j = 0; j < 12; j++) { - u32 tmp_in = in[j]; - u32 tmp_out = 0; + u32x tmp_in = in[j]; + u32x tmp_out = 0; - p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 0; - p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 8; - p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 16; - p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 24; + p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 0; + p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 8; + p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 16; + p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 24; in[j] = tmp_out; } } } -static void lotus_transform_password (u32 in[4], u32 out[4], __local u32 s_lotus_magic_table[256]) +static void lotus_transform_password (u32x in[4], u32x out[4], __local u32 s_lotus_magic_table[256]) { - u32 t = out[3] >> 24; + u32x t = out[3] >> 24; - u32 c; + u32x c; #pragma unroll 4 for (int i = 0; i < 4; i++) { - t ^= (in[i] >> 0) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff); - t ^= (in[i] >> 8) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff); - t ^= (in[i] >> 16) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff); - t ^= (in[i] >> 24) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff); + t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff); + t ^= (in[i] >> 8) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff); + t ^= (in[i] >> 16) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff); + t ^= (in[i] >> 24) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff); } } @@ -177,9 +185,9 @@ static void pad (u32 w[4], const u32 len) } } -static void mdtransform_norecalc (u32 state[4], u32 block[4], __local u32 s_lotus_magic_table[256]) +static void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 s_lotus_magic_table[256]) { - u32 x[12]; + u32x x[12]; x[ 0] = state[0]; x[ 1] = state[1]; @@ -202,23 +210,23 @@ static void mdtransform_norecalc (u32 state[4], u32 block[4], __local u32 s_lotu state[3] = x[3]; } -static void mdtransform (u32 state[4], u32 checksum[4], u32 block[4], __local u32 s_lotus_magic_table[256]) +static void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 s_lotus_magic_table[256]) { mdtransform_norecalc (state, block, s_lotus_magic_table); lotus_transform_password (block, checksum, s_lotus_magic_table); } -static void domino_big_md (const u32 saved_key[16], const u32 size, u32 state[4], __local u32 s_lotus_magic_table[256]) +static void domino_big_md (const u32x saved_key[16], const u32x size, u32x state[4], __local u32 s_lotus_magic_table[256]) { - u32 checksum[4]; + u32x checksum[4]; checksum[0] = 0; checksum[1] = 0; checksum[2] = 0; checksum[3] = 0; - u32 block[4]; + u32x block[4]; block[0] = saved_key[0]; block[1] = saved_key[1]; @@ -230,7 +238,7 @@ static void domino_big_md (const u32 saved_key[16], const u32 size, u32 state[4] mdtransform_norecalc (state, checksum, s_lotus_magic_table); } -static void m08600m (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m08600m (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -266,13 +274,13 @@ static void m08600m (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w_tmp[16]; + u32x w_tmp[16]; w_tmp[ 0] = w0; w_tmp[ 1] = w[ 1]; @@ -291,7 +299,7 @@ static void m08600m (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 w_tmp[14] = w[14]; w_tmp[15] = w[15]; - u32 state[4]; + u32x state[4]; state[0] = 0; state[1] = 0; @@ -300,16 +308,11 @@ static void m08600m (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table); - const u32 r0 = state[0]; - const u32 r1 = state[1]; - const u32 r2 = state[2]; - const u32 r3 = state[3]; - - #include COMPARE_M + COMPARE_M_SIMD (state[0], state[1], state[2], state[3]); } } -static void m08600s (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m08600s (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -357,13 +360,13 @@ static void m08600s (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w_tmp[16]; + u32x w_tmp[16]; w_tmp[ 0] = w0; w_tmp[ 1] = w[ 1]; @@ -382,7 +385,7 @@ static void m08600s (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 w_tmp[14] = w[14]; w_tmp[15] = w[15]; - u32 state[4]; + u32x state[4]; state[0] = 0; state[1] = 0; @@ -391,16 +394,11 @@ static void m08600s (__local u32 s_lotus_magic_table[256], u32 w[16], const u32 domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table); - const u32 r0 = state[0]; - const u32 r1 = state[1]; - const u32 r2 = state[2]; - const u32 r3 = state[3]; - - #include COMPARE_S + COMPARE_S_SIMD (state[0], state[1], state[2], state[3]); } } -__kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -457,7 +455,7 @@ __kernel void m08600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08600_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -514,7 +512,7 @@ __kernel void m08600_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08600_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -571,7 +569,7 @@ __kernel void m08600_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08600m (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -628,7 +626,7 @@ __kernel void m08600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08600_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -685,7 +683,7 @@ __kernel void m08600_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08600s (s_lotus_magic_table, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08600_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08600_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m08700_a1.cl b/OpenCL/m08700_a1.cl index 7e67c9793..65a21987b 100644 --- a/OpenCL/m08700_a1.cl +++ b/OpenCL/m08700_a1.cl @@ -322,7 +322,7 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -372,7 +372,7 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; @@ -561,7 +561,7 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -623,7 +623,7 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; diff --git a/OpenCL/m08700_a3.cl b/OpenCL/m08700_a3.cl index 586d3455f..9e6b5fec5 100644 --- a/OpenCL/m08700_a3.cl +++ b/OpenCL/m08700_a3.cl @@ -5,6 +5,8 @@ #define _LOTUS6_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 lotus_magic_table[256] = { @@ -58,45 +58,63 @@ __constant u32 lotus_magic_table[256] = #define BOX(S,i) (S)[(i)] -#define uint_to_hex_upper8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif -static void lotus_mix (u32 *in, __local u32 s_lotus_magic_table[256]) +#if VECT_SIZE == 1 +#define BOX1(S,i) (S)[(i)] +#elif VECT_SIZE == 2 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1]) +#elif VECT_SIZE == 4 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3]) +#elif VECT_SIZE == 8 +#define BOX1(S,i) (u32x) ((S)[(i).s0], (S)[(i).s1], (S)[(i).s2], (S)[(i).s3], (S)[(i).s4], (S)[(i).s5], (S)[(i).s6], (S)[(i).s7]) +#endif + +static void lotus_mix (u32x *in, __local u32 s_lotus_magic_table[256]) { - u32 p = 0; + u32x p = 0; for (int i = 0; i < 18; i++) { u32 s = 48; - #pragma unroll + #pragma unroll 12 for (int j = 0; j < 12; j++) { - u32 tmp_in = in[j]; - u32 tmp_out = 0; + u32x tmp_in = in[j]; + u32x tmp_out = 0; - p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 0; - p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 8; - p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 16; - p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX (s_lotus_magic_table, p); tmp_out |= p << 24; + p = (p + s--) & 0xff; p = ((tmp_in >> 0) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 0; + p = (p + s--) & 0xff; p = ((tmp_in >> 8) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 8; + p = (p + s--) & 0xff; p = ((tmp_in >> 16) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 16; + p = (p + s--) & 0xff; p = ((tmp_in >> 24) & 0xff) ^ BOX1 (s_lotus_magic_table, p); tmp_out |= p << 24; in[j] = tmp_out; } } } -static void lotus_transform_password (u32 in[4], u32 out[4], __local u32 s_lotus_magic_table[256]) +static void lotus_transform_password (u32x in[4], u32x out[4], __local u32 s_lotus_magic_table[256]) { - u32 t = out[3] >> 24; + u32x t = out[3] >> 24; - u32 c; + u32x c; - //#pragma unroll // kernel fails if used + #pragma unroll 4 for (int i = 0; i < 4; i++) { - t ^= (in[i] >> 0) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff); - t ^= (in[i] >> 8) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff); - t ^= (in[i] >> 16) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff); - t ^= (in[i] >> 24) & 0xff; c = BOX (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff); + t ^= (in[i] >> 0) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 0; t = ((out[i] >> 0) & 0xff); + t ^= (in[i] >> 8) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 8; t = ((out[i] >> 8) & 0xff); + t ^= (in[i] >> 16) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 16; t = ((out[i] >> 16) & 0xff); + t ^= (in[i] >> 24) & 0xff; c = BOX1 (s_lotus_magic_table, t); out[i] ^= c << 24; t = ((out[i] >> 24) & 0xff); } } @@ -179,9 +197,9 @@ static void pad (u32 w[4], const u32 len) } } -static void mdtransform_norecalc (u32 state[4], u32 block[4], __local u32 s_lotus_magic_table[256]) +static void mdtransform_norecalc (u32x state[4], u32x block[4], __local u32 s_lotus_magic_table[256]) { - u32 x[12]; + u32x x[12]; x[ 0] = state[0]; x[ 1] = state[1]; @@ -204,23 +222,23 @@ static void mdtransform_norecalc (u32 state[4], u32 block[4], __local u32 s_lotu state[3] = x[3]; } -static void mdtransform (u32 state[4], u32 checksum[4], u32 block[4], __local u32 s_lotus_magic_table[256]) +static void mdtransform (u32x state[4], u32x checksum[4], u32x block[4], __local u32 s_lotus_magic_table[256]) { mdtransform_norecalc (state, block, s_lotus_magic_table); lotus_transform_password (block, checksum, s_lotus_magic_table); } -static void domino_big_md (const u32 saved_key[16], const u32 size, u32 state[4], __local u32 s_lotus_magic_table[256]) +static void domino_big_md (const u32x saved_key[16], const u32 size, u32x state[4], __local u32 s_lotus_magic_table[256]) { - u32 checksum[4]; + u32x checksum[4]; checksum[0] = 0; checksum[1] = 0; checksum[2] = 0; checksum[3] = 0; - u32 block[4]; + u32x block[4]; block[0] = 0; block[1] = 0; @@ -250,7 +268,7 @@ static void domino_big_md (const u32 saved_key[16], const u32 size, u32 state[4] mdtransform_norecalc (state, checksum, s_lotus_magic_table); } -static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -293,13 +311,13 @@ static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w_tmp[16]; + u32x w_tmp[16]; w_tmp[ 0] = w0; w_tmp[ 1] = w[ 1]; @@ -318,7 +336,7 @@ static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc w_tmp[14] = w[14]; w_tmp[15] = w[15]; - u32 state[4]; + u32x state[4]; state[0] = 0; state[1] = 0; @@ -327,24 +345,24 @@ static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table); - const u32 w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0 - | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0 - | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0 - | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16; - //const u32 w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0 - // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16; + const u32x w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16; + const u32x w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0 + | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16; + const u32x w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16; + const u32x w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0 + | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16; + const u32x w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16; + const u32x w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0 + | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16; + const u32x w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16; + //const u32x w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0 + // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16; - const u32 pade = 0x0e0e0e0e; + const u32x pade = 0x0e0e0e0e; w_tmp[ 0] = salt0; w_tmp[ 1] = salt1 | w0_t << 16; @@ -370,21 +388,16 @@ static void m08700m (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc domino_big_md (w_tmp, 34, state, s_lotus_magic_table); - u32 a = state[0] & 0xffffffff; - u32 b = state[1] & 0xffffffff; - u32 c = state[2] & 0x000000ff; - u32 d = state[3] & 0x00000000; + u32x a = state[0] & 0xffffffff; + u32x b = state[1] & 0xffffffff; + u32x c = state[2] & 0x000000ff; + u32x d = state[3] & 0x00000000; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = c; - const u32 r3 = d; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } -static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc[256], u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -439,13 +452,13 @@ static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w_tmp[16]; + u32x w_tmp[16]; w_tmp[ 0] = w0; w_tmp[ 1] = w[ 1]; @@ -464,7 +477,7 @@ static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc w_tmp[14] = w[14]; w_tmp[15] = w[15]; - u32 state[4]; + u32x state[4]; state[0] = 0; state[1] = 0; @@ -473,24 +486,24 @@ static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc domino_big_md (w_tmp, pw_len, state, s_lotus_magic_table); - const u32 w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16; - const u32 w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0 - | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16; - const u32 w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16; - const u32 w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0 - | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16; - const u32 w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16; - const u32 w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0 - | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16; - const u32 w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0 - | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16; - //const u32 w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0 - // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16; + const u32x w0_t = uint_to_hex_upper8 ((state[0] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[0] >> 8) & 255) << 16; + const u32x w1_t = uint_to_hex_upper8 ((state[0] >> 16) & 255) << 0 + | uint_to_hex_upper8 ((state[0] >> 24) & 255) << 16; + const u32x w2_t = uint_to_hex_upper8 ((state[1] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[1] >> 8) & 255) << 16; + const u32x w3_t = uint_to_hex_upper8 ((state[1] >> 16) & 255) << 0 + | uint_to_hex_upper8 ((state[1] >> 24) & 255) << 16; + const u32x w4_t = uint_to_hex_upper8 ((state[2] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[2] >> 8) & 255) << 16; + const u32x w5_t = uint_to_hex_upper8 ((state[2] >> 16) & 255) << 0 + | uint_to_hex_upper8 ((state[2] >> 24) & 255) << 16; + const u32x w6_t = uint_to_hex_upper8 ((state[3] >> 0) & 255) << 0 + | uint_to_hex_upper8 ((state[3] >> 8) & 255) << 16; + //const u32x w7_t = uint_to_hex_upper8 ((state[3] >> 16) & 255) << 0 + // | uint_to_hex_upper8 ((state[3] >> 24) & 255) << 16; - const u32 pade = 0x0e0e0e0e; + const u32x pade = 0x0e0e0e0e; w_tmp[ 0] = salt0; w_tmp[ 1] = salt1 | w0_t << 16; @@ -516,21 +529,16 @@ static void m08700s (__local u32 s_lotus_magic_table[256], __local u32 l_bin2asc domino_big_md (w_tmp, 34, state, s_lotus_magic_table); - u32 a = state[0] & 0xffffffff; - u32 b = state[1] & 0xffffffff; - u32 c = state[2] & 0x000000ff; - u32 d = state[3] & 0x00000000; + u32x a = state[0] & 0xffffffff; + u32x b = state[1] & 0xffffffff; + u32x c = state[2] & 0x000000ff; + u32x d = state[3] & 0x00000000; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = c; - const u32 r3 = d; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } -__kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -598,7 +606,7 @@ __kernel void m08700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08700m (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -666,7 +674,7 @@ __kernel void m08700_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08700m (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -734,7 +742,7 @@ __kernel void m08700_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08700m (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -802,7 +810,7 @@ __kernel void m08700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08700s (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -870,7 +878,7 @@ __kernel void m08700_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m08700s (s_lotus_magic_table, l_bin2asc, w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m08700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m08700_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m09700_a1.cl b/OpenCL/m09700_a1.cl index f3abd0029..013755910 100644 --- a/OpenCL/m09700_a1.cl +++ b/OpenCL/m09700_a1.cl @@ -633,7 +633,7 @@ __kernel void m09700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -712,7 +712,7 @@ __kernel void m09700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -914,7 +914,7 @@ __kernel void m09700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -993,7 +993,7 @@ __kernel void m09700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m09700_a3.cl b/OpenCL/m09700_a3.cl index 27dae55fd..c84cfe02e 100644 --- a/OpenCL/m09700_a3.cl +++ b/OpenCL/m09700_a3.cl @@ -5,6 +5,8 @@ #define _OLDOFFICE01_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" typedef struct { @@ -37,10 +37,10 @@ static void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) rc4_key->S[j] = tmp; } -static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4]) { - u32 v = 0x03020100; - u32 a = 0x04040404; + u32x v = 0x03020100; + u32x a = 0x04040404; __local u32 *ptr = (__local u32 *) rc4_key->S; @@ -88,7 +88,7 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) } } -static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4]) { #pragma unroll for (u32 k = 0; k < 4; k++) @@ -139,29 +139,29 @@ static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u3 return j; } -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -301,18 +301,18 @@ static void m09700m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -329,10 +329,10 @@ static void m09700m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = pw_len * 8; w3_t[3] = 0; - u32 digest_t0[4]; - u32 digest_t1[2]; // need only first 5 byte - u32 digest_t2[2]; - u32 digest_t3[2]; + u32x digest_t0[4]; + u32x digest_t1[2]; // need only first 5 byte + u32x digest_t2[2]; + u32x digest_t3[2]; digest_t0[0] = MD5M_A; digest_t0[1] = MD5M_B; @@ -343,7 +343,7 @@ static void m09700m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ // prepare 16 * 21 buffer stuff - u32 digest[4]; + u32x digest[4]; digest[0] = MD5M_A; digest[1] = MD5M_B; @@ -685,7 +685,7 @@ static void m09700m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ // now the RC4 part - u32 key[4]; + u32x key[4]; key[0] = digest[0]; key[1] = digest[1]; @@ -694,7 +694,7 @@ static void m09700m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); @@ -724,12 +724,7 @@ static void m09700m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_M + COMPARE_M_SIMD (out[0], out[1], out[2], out[3]); } } @@ -809,18 +804,18 @@ static void m09700s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -837,10 +832,10 @@ static void m09700s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = pw_len * 8; w3_t[3] = 0; - u32 digest_t0[4]; - u32 digest_t1[2]; // need only first 5 byte - u32 digest_t2[2]; - u32 digest_t3[2]; + u32x digest_t0[4]; + u32x digest_t1[2]; // need only first 5 byte + u32x digest_t2[2]; + u32x digest_t3[2]; digest_t0[0] = MD5M_A; digest_t0[1] = MD5M_B; @@ -1193,7 +1188,7 @@ static void m09700s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ // now the RC4 part - u32 key[4]; + u32x key[4]; key[0] = digest[0]; key[1] = digest[1]; @@ -1202,7 +1197,7 @@ static void m09700s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); @@ -1232,12 +1227,7 @@ static void m09700s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_S + COMPARE_S_SIMD (out[0], out[1], out[2], out[3]); } } diff --git a/OpenCL/m09710_a1.cl b/OpenCL/m09710_a1.cl index 36a79b8c0..5e66a072a 100644 --- a/OpenCL/m09710_a1.cl +++ b/OpenCL/m09710_a1.cl @@ -289,7 +289,7 @@ __kernel void m09710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -345,7 +345,7 @@ __kernel void m09710_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } // first md5 to generate RC4 128 bit key @@ -491,7 +491,7 @@ __kernel void m09710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -559,7 +559,7 @@ __kernel void m09710_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } // first md5 to generate RC4 128 bit key diff --git a/OpenCL/m09710_a3.cl b/OpenCL/m09710_a3.cl index ad28a6286..76cfef13d 100644 --- a/OpenCL/m09710_a3.cl +++ b/OpenCL/m09710_a3.cl @@ -5,6 +5,8 @@ #define _OLDOFFICE01_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" typedef struct { @@ -37,10 +37,10 @@ static void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) rc4_key->S[j] = tmp; } -static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4]) { - u32 v = 0x03020100; - u32 a = 0x04040404; + u32x v = 0x03020100; + u32x a = 0x04040404; __local u32 *ptr = (__local u32 *) rc4_key->S; @@ -88,7 +88,7 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) } } -static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4]) { #pragma unroll for (u32 k = 0; k < 4; k++) @@ -139,29 +139,29 @@ static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u3 return j; } -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -267,20 +267,20 @@ static void m09710m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; // first md5 to generate RC4 128 bit key - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1] & 0xff; w0_t[2] = 0x8000; w0_t[3] = 0; @@ -297,7 +297,7 @@ static void m09710m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 9 * 8; w3_t[3] = 0; - u32 digest[4]; + u32x digest[4]; digest[0] = MD5M_A; digest[1] = MD5M_B; @@ -308,7 +308,7 @@ static void m09710m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ // now the RC4 part - u32 key[4]; + u32x key[4]; key[0] = digest[0]; key[1] = digest[1]; @@ -317,7 +317,7 @@ static void m09710m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); @@ -347,12 +347,7 @@ static void m09710m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_M + COMPARE_M_SIMD (out[0], out[1], out[2], out[3]); } } @@ -398,20 +393,20 @@ static void m09710s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; // first md5 to generate RC4 128 bit key - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1] & 0xff; w0_t[2] = 0x8000; w0_t[3] = 0; @@ -428,7 +423,7 @@ static void m09710s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 9 * 8; w3_t[3] = 0; - u32 digest[4]; + u32x digest[4]; digest[0] = MD5M_A; digest[1] = MD5M_B; @@ -439,7 +434,7 @@ static void m09710s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ // now the RC4 part - u32 key[4]; + u32x key[4]; key[0] = digest[0]; key[1] = digest[1]; @@ -448,7 +443,7 @@ static void m09710s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); @@ -478,12 +473,7 @@ static void m09710s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_S + COMPARE_S_SIMD (out[0], out[1], out[2], out[3]); } } diff --git a/OpenCL/m09720_a1.cl b/OpenCL/m09720_a1.cl index 3746e553c..fdff5b906 100644 --- a/OpenCL/m09720_a1.cl +++ b/OpenCL/m09720_a1.cl @@ -510,7 +510,7 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -564,7 +564,7 @@ __kernel void m09720_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -698,7 +698,7 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -764,7 +764,7 @@ __kernel void m09720_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m09720_a3.cl b/OpenCL/m09720_a3.cl index a3f170179..788214ff0 100644 --- a/OpenCL/m09720_a3.cl +++ b/OpenCL/m09720_a3.cl @@ -5,6 +5,8 @@ #define _OLDOFFICE01_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,33 +18,31 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -118,12 +118,12 @@ static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], co digest[3] += d; } -static void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4]) +static void gen336 (u32x digest_pre[4], u32 salt_buf[4], u32x digest[4]) { - u32 digest_t0[2]; - u32 digest_t1[2]; - u32 digest_t2[2]; - u32 digest_t3[2]; + u32x digest_t0[2]; + u32x digest_t1[2]; + u32x digest_t2[2]; + u32x digest_t3[2]; digest_t0[0] = digest_pre[0]; digest_t0[1] = digest_pre[1] & 0xff; @@ -137,10 +137,10 @@ static void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4]) digest_t3[0] = digest_pre[0] << 24; digest_t3[1] = digest_pre[0] >> 8 | digest_pre[1] << 24; - u32 salt_buf_t0[4]; - u32 salt_buf_t1[5]; - u32 salt_buf_t2[5]; - u32 salt_buf_t3[5]; + u32x salt_buf_t0[4]; + u32x salt_buf_t1[5]; + u32x salt_buf_t2[5]; + u32x salt_buf_t3[5]; salt_buf_t0[0] = salt_buf[0]; salt_buf_t0[1] = salt_buf[1]; @@ -165,10 +165,10 @@ static void gen336 (u32 digest_pre[4], u32 salt_buf[4], u32 digest[4]) salt_buf_t3[3] = salt_buf[2] >> 8 | salt_buf[3] << 24; salt_buf_t3[4] = salt_buf[3] >> 8; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; // generate the 16 * 21 buffer @@ -488,18 +488,18 @@ static void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -516,7 +516,7 @@ static void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = pw_len * 8; w3_t[3] = 0; - u32 digest_pre[4]; + u32x digest_pre[4]; digest_pre[0] = MD5M_A; digest_pre[1] = MD5M_B; @@ -530,7 +530,7 @@ static void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le digest_pre[2] &= 0x00000000; digest_pre[3] &= 0x00000000; - u32 digest[4]; + u32x digest[4]; digest[0] = MD5M_A; digest[1] = MD5M_B; @@ -539,15 +539,12 @@ static void m09720m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le gen336 (digest_pre, salt_buf, digest); - u32 a = digest[0]; - u32 b = digest[1] & 0xff; + u32x a = digest[0]; + u32x b = digest[1] & 0xff; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } @@ -589,18 +586,18 @@ static void m09720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - w0_t[0] = w0[0]; + w0_t[0] = w0lr; w0_t[1] = w0[1]; w0_t[2] = w0[2]; w0_t[3] = w0[3]; @@ -617,7 +614,7 @@ static void m09720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = pw_len * 8; w3_t[3] = 0; - u32 digest_pre[4]; + u32x digest_pre[4]; digest_pre[0] = MD5M_A; digest_pre[1] = MD5M_B; @@ -631,7 +628,7 @@ static void m09720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le digest_pre[2] &= 0x00000000; digest_pre[3] &= 0x00000000; - u32 digest[4]; + u32x digest[4]; digest[0] = MD5M_A; digest[1] = MD5M_B; @@ -640,15 +637,12 @@ static void m09720s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le gen336 (digest_pre, salt_buf, digest); - u32 a = digest[0]; - u32 b = digest[1] & 0xff; + u32x a = digest[0]; + u32x b = digest[1] & 0xff; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } diff --git a/OpenCL/m09800_a0.cl b/OpenCL/m09800_a0.cl index 194a13b0a..adc5b75ff 100644 --- a/OpenCL/m09800_a0.cl +++ b/OpenCL/m09800_a0.cl @@ -379,7 +379,7 @@ __kernel void m09800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; @@ -624,7 +624,7 @@ __kernel void m09800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; diff --git a/OpenCL/m09800_a1.cl b/OpenCL/m09800_a1.cl index 713dae8b2..bf39291d1 100644 --- a/OpenCL/m09800_a1.cl +++ b/OpenCL/m09800_a1.cl @@ -319,7 +319,7 @@ __kernel void m09800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -390,7 +390,7 @@ __kernel void m09800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -431,7 +431,7 @@ __kernel void m09800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; @@ -606,7 +606,7 @@ __kernel void m09800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -689,7 +689,7 @@ __kernel void m09800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -730,7 +730,7 @@ __kernel void m09800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; diff --git a/OpenCL/m09800_a3.cl b/OpenCL/m09800_a3.cl index 4b543df9b..652756e41 100644 --- a/OpenCL/m09800_a3.cl +++ b/OpenCL/m09800_a3.cl @@ -5,6 +5,8 @@ #define _OLDOFFICE34_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" typedef struct { @@ -37,10 +37,10 @@ static void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) rc4_key->S[j] = tmp; } -static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4]) { - u32 v = 0x03020100; - u32 a = 0x04040404; + u32x v = 0x03020100; + u32x a = 0x04040404; __local u32 *ptr = (__local u32 *) rc4_key->S; @@ -56,7 +56,7 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) { u32 idx = i * 16; - u32 v; + u32x v; v = data[0]; @@ -88,12 +88,12 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) } } -static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4]) { #pragma unroll for (u32 k = 0; k < 4; k++) { - u32 xor4 = 0; + u32x xor4 = 0; u8 idx; @@ -139,30 +139,30 @@ static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u3 return j; } -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -312,22 +312,22 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = w0[0]; + w1_t[0] = w0lr; w1_t[1] = w0[1]; w1_t[2] = w0[2]; w1_t[3] = w0[3]; @@ -340,7 +340,7 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -375,7 +375,7 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 key[4]; + u32x key[4]; key[0] = swap32 (digest[0]); key[1] = swap32 (digest[1]); @@ -391,7 +391,7 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); @@ -427,12 +427,7 @@ static void m09800m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_M + COMPARE_M_SIMD (out[0], out[1], out[2], out[3]); } } @@ -493,22 +488,22 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = w0[0]; + w1_t[0] = w0lr; w1_t[1] = w0[1]; w1_t[2] = w0[2]; w1_t[3] = w0[3]; @@ -521,7 +516,7 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -556,7 +551,7 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 key[4]; + u32x key[4]; key[0] = swap32 (digest[0]); key[1] = swap32 (digest[1]); @@ -572,7 +567,7 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); @@ -608,12 +603,7 @@ static void m09800s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_S + COMPARE_S_SIMD (out[0], out[1], out[2], out[3]); } } diff --git a/OpenCL/m09810_a1.cl b/OpenCL/m09810_a1.cl index d49073602..617229310 100644 --- a/OpenCL/m09810_a1.cl +++ b/OpenCL/m09810_a1.cl @@ -319,7 +319,7 @@ __kernel void m09810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -375,7 +375,7 @@ __kernel void m09810_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -501,7 +501,7 @@ __kernel void m09810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -569,7 +569,7 @@ __kernel void m09810_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m09810_a3.cl b/OpenCL/m09810_a3.cl index 43aea475b..a6fd83f19 100644 --- a/OpenCL/m09810_a3.cl +++ b/OpenCL/m09810_a3.cl @@ -5,6 +5,8 @@ #define _OLDOFFICE34_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" typedef struct { @@ -37,10 +37,10 @@ static void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) rc4_key->S[j] = tmp; } -static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4]) { - u32 v = 0x03020100; - u32 a = 0x04040404; + u32x v = 0x03020100; + u32x a = 0x04040404; __local u32 *ptr = (__local u32 *) rc4_key->S; @@ -56,7 +56,7 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) { u32 idx = i * 16; - u32 v; + u32x v; v = data[0]; @@ -88,12 +88,12 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) } } -static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u32 out[4]) +static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32x in[4], u32x out[4]) { #pragma unroll for (u32 k = 0; k < 4; k++) { - u32 xor4 = 0; + u32x xor4 = 0; u8 idx; @@ -139,30 +139,30 @@ static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, const u32 in[4], u3 return j; } -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -297,29 +297,29 @@ static void m09810m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 key[4]; + u32x key[4]; - key[0] = w0[0]; + key[0] = w0lr; key[1] = w0[1] & 0xff; key[2] = 0; key[3] = 0; rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = swap32 (out[0]); w0_t[1] = swap32 (out[1]); @@ -338,7 +338,7 @@ static void m09810m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 0; w3_t[3] = 16 * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -355,12 +355,7 @@ static void m09810m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_M + COMPARE_M_SIMD (out[0], out[1], out[2], out[3]); } } @@ -406,29 +401,29 @@ static void m09810s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 key[4]; + u32x key[4]; - key[0] = w0[0]; + key[0] = w0lr; key[1] = w0[1] & 0xff; key[2] = 0; key[3] = 0; rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; u8 j = rc4_next_16 (rc4_key, 0, 0, encryptedVerifier, out); - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = swap32 (out[0]); w0_t[1] = swap32 (out[1]); @@ -447,7 +442,7 @@ static void m09810s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 0; w3_t[3] = 16 * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -464,12 +459,7 @@ static void m09810s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ rc4_next_16 (rc4_key, 16, j, digest, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_S + COMPARE_S_SIMD (out[0], out[1], out[2], out[3]); } } diff --git a/OpenCL/m09820_a0.cl b/OpenCL/m09820_a0.cl index 20bdb6f41..1883d1194 100644 --- a/OpenCL/m09820_a0.cl +++ b/OpenCL/m09820_a0.cl @@ -245,7 +245,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; @@ -426,7 +426,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; diff --git a/OpenCL/m09820_a1.cl b/OpenCL/m09820_a1.cl index c91484e73..73f62b9fb 100644 --- a/OpenCL/m09820_a1.cl +++ b/OpenCL/m09820_a1.cl @@ -196,7 +196,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -254,7 +254,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -295,7 +295,7 @@ __kernel void m09820_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; @@ -417,7 +417,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -487,7 +487,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -528,7 +528,7 @@ __kernel void m09820_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, make_unicode (w0, w0_t, w1_t); make_unicode (w1, w2_t, w3_t); - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, salt_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, salt_len); w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; diff --git a/OpenCL/m09820_a3.cl b/OpenCL/m09820_a3.cl index 3b7148fb5..bd56b0a43 100644 --- a/OpenCL/m09820_a3.cl +++ b/OpenCL/m09820_a3.cl @@ -5,6 +5,8 @@ #define _OLDOFFICE34_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,34 +18,32 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void sha1_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[5]) +static void sha1_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[5]) { - u32 A = digest[0]; - u32 B = digest[1]; - u32 C = digest[2]; - u32 D = digest[3]; - u32 E = digest[4]; + u32x A = digest[0]; + u32x B = digest[1]; + u32x C = digest[2]; + u32x D = digest[3]; + u32x E = digest[4]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; #undef K #define K SHA1C00 @@ -178,22 +178,22 @@ static void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = w0[0]; + w1_t[0] = w0lr; w1_t[1] = w0[1]; w1_t[2] = w0[2]; w1_t[3] = w0[3]; @@ -206,7 +206,7 @@ static void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -241,15 +241,12 @@ static void m09820m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap32 (digest[0]); - u32 b = swap32 (digest[1]) & 0xff; + u32x a = swap32 (digest[0]); + u32x b = swap32 (digest[1]) & 0xff; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } @@ -295,22 +292,22 @@ static void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf[0]; w0_t[1] = salt_buf[1]; w0_t[2] = salt_buf[2]; w0_t[3] = salt_buf[3]; - w1_t[0] = w0[0]; + w1_t[0] = w0lr; w1_t[1] = w0[1]; w1_t[2] = w0[2]; w1_t[3] = w0[3]; @@ -323,7 +320,7 @@ static void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = pw_salt_len * 8; - u32 digest[5]; + u32x digest[5]; digest[0] = SHA1M_A; digest[1] = SHA1M_B; @@ -358,15 +355,12 @@ static void m09820s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le sha1_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = swap32 (digest[0]); - u32 b = swap32 (digest[1]) & 0xff; + u32x a = swap32 (digest[0]); + u32x b = swap32 (digest[1]) & 0xff; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } diff --git a/OpenCL/m09900_a1.cl b/OpenCL/m09900_a1.cl index c03c542d2..32759c119 100644 --- a/OpenCL/m09900_a1.cl +++ b/OpenCL/m09900_a1.cl @@ -68,7 +68,7 @@ __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -111,7 +111,7 @@ __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -385,7 +385,7 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -440,7 +440,7 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m09900_a3.cl b/OpenCL/m09900_a3.cl index 0c7ca3ab9..5cefe5042 100644 --- a/OpenCL/m09900_a3.cl +++ b/OpenCL/m09900_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,9 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -static void m09900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m09900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -107,16 +107,16 @@ static void m09900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00); MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01); @@ -191,15 +191,15 @@ static void m09900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; - u32 t0[4]; - u32 t1[4]; - u32 t2[4]; - u32 t3[4]; + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; t0[0] = 0; t0[1] = 0; @@ -291,16 +291,11 @@ static void m09900m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } -static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -399,16 +394,16 @@ static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0, F_w0c00, MD5S00); MD5_STEP0(MD5_Fo, d, a, b, c, F_w1c01, MD5S01); @@ -483,15 +478,15 @@ static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; - u32 t0[4]; - u32 t1[4]; - u32 t2[4]; - u32 t3[4]; + u32x t0[4]; + u32x t1[4]; + u32x t2[4]; + u32x t3[4]; t0[0] = 0; t0[1] = 0; @@ -575,7 +570,7 @@ static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k MD5_STEP (MD5_I , b, c, d, a, t3[1], MD5C3b, MD5S33); MD5_STEP (MD5_I , a, b, c, d, t1[0], MD5C3c, MD5S30); - if (allx ((a + r_a) != search[0])) continue; + if (MATCHES_NONE_VS ((a + r_a), search[0])) continue; MD5_STEP (MD5_I , d, a, b, c, t2[3], MD5C3d, MD5S31); MD5_STEP (MD5_I , c, d, a, b, t0[2], MD5C3e, MD5S32); @@ -586,16 +581,11 @@ static void m09900s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } -__kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -633,7 +623,7 @@ __kernel void m09900_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m09900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m09900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m09900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -671,7 +661,7 @@ __kernel void m09900_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m09900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m09900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m09900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -709,7 +699,7 @@ __kernel void m09900_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m09900m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -747,7 +737,7 @@ __kernel void m09900_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m09900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m09900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m09900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -785,7 +775,7 @@ __kernel void m09900_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m09900s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m09900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m09900_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m10100_a1.cl b/OpenCL/m10100_a1.cl index 5e4a24ccc..7960896b2 100644 --- a/OpenCL/m10100_a1.cl +++ b/OpenCL/m10100_a1.cl @@ -84,7 +84,7 @@ __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -141,7 +141,7 @@ __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; @@ -264,7 +264,7 @@ __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -333,7 +333,7 @@ __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; diff --git a/OpenCL/m10100_a3.cl b/OpenCL/m10100_a3.cl index 2cc901210..076615cbe 100644 --- a/OpenCL/m10100_a3.cl +++ b/OpenCL/m10100_a3.cl @@ -5,6 +5,8 @@ #define _SIPHASH_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,15 +18,13 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define SIPROUND(v0,v1,v2,v3) \ (v0) += (v1); \ (v1) = rotl64 ((v1), 13); \ (v1) ^= (v0); \ - (v0) = as_ulong (as_uint2 ((v0)).s10); \ + (v0) = rotl64 ((v0), 32); \ (v2) += (v3); \ (v3) = rotl64 ((v3), 16); \ (v3) ^= (v2); \ @@ -34,9 +34,9 @@ (v2) += (v1); \ (v1) = rotl64 ((v1), 17); \ (v1) ^= (v2); \ - (v2) = as_ulong (as_uint2 ((v2)).s10); + (v2) = rotl64 ((v2), 32) -static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -54,10 +54,10 @@ static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u64 v2p = SIPHASHM_2; u64 v3p = SIPHASHM_3; - v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); - v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); - v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); - v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); + v0p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); + v1p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); + v2p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); + v3p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); u64 *w_ptr = (u64 *) w; @@ -69,18 +69,18 @@ static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u64 v0 = v0p; - u64 v1 = v1p; - u64 v2 = v2p; - u64 v3 = v3p; + u64x v0 = v0p; + u64x v1 = v1p; + u64x v2 = v2p; + u64x v3 = v3p; - u64 m = hl32_to_64 (w[1], w0); + u64x m = hl32_to_64 (w[1], w0); v3 ^= m; @@ -111,21 +111,18 @@ static void m10100m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SIPROUND (v0, v1, v2, v3); SIPROUND (v0, v1, v2, v3); - const u64 v = v0 ^ v1 ^ v2 ^ v3; + const u64x v = v0 ^ v1 ^ v2 ^ v3; - const u32 a = l32_from_64 (v); - const u32 b = h32_from_64 (v); + const u32x a = l32_from_64 (v); + const u32x b = h32_from_64 (v); + const u32x c = 0; + const u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } -static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -155,10 +152,10 @@ static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u64 v2p = SIPHASHM_2; u64 v3p = SIPHASHM_3; - v0p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); - v1p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); - v2p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); - v3p ^= hl32_to_64 (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); + v0p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); + v1p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); + v2p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[1], salt_bufs[salt_pos].salt_buf[0]); + v3p ^= hl32_to_64_S (salt_bufs[salt_pos].salt_buf[3], salt_bufs[salt_pos].salt_buf[2]); u64 *w_ptr = (u64 *) w; @@ -170,18 +167,18 @@ static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u64 v0 = v0p; - u64 v1 = v1p; - u64 v2 = v2p; - u64 v3 = v3p; + u64x v0 = v0p; + u64x v1 = v1p; + u64x v2 = v2p; + u64x v3 = v3p; - u64 m = hl32_to_64 (w[1], w0); + u64x m = hl32_to_64 (w[1], w0); v3 ^= m; @@ -212,21 +209,18 @@ static void m10100s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k SIPROUND (v0, v1, v2, v3); SIPROUND (v0, v1, v2, v3); - const u64 v = v0 ^ v1 ^ v2 ^ v3; + const u64x v = v0 ^ v1 ^ v2 ^ v3; - const u32 a = l32_from_64 (v); - const u32 b = h32_from_64 (v); + const u32x a = l32_from_64 (v); + const u32x b = h32_from_64 (v); + const u32x c = 0; + const u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } -__kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -264,7 +258,7 @@ __kernel void m10100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -302,7 +296,7 @@ __kernel void m10100_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -340,7 +334,7 @@ __kernel void m10100_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10100m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -378,7 +372,7 @@ __kernel void m10100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -416,7 +410,7 @@ __kernel void m10100_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10100s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10100_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m10300.cl b/OpenCL/m10300.cl index c1ffcbc5c..1ab9dbd5b 100644 --- a/OpenCL/m10300.cl +++ b/OpenCL/m10300.cl @@ -224,7 +224,7 @@ __kernel void m10300_init (__global pw_t *pws, __global kernel_rule_t *rules_buf w3[2] = 0; w3[3] = 0; - switch_buffer_by_offset (w0, w1, w2, w3, pw_len); + switch_buffer_by_offset_le (w0, w1, w2, w3, pw_len); w0[0] |= word_buf0[0]; w0[1] |= word_buf0[1]; diff --git a/OpenCL/m10400_a0.cl b/OpenCL/m10400_a0.cl index 672408c6a..bb0704885 100644 --- a/OpenCL/m10400_a0.cl +++ b/OpenCL/m10400_a0.cl @@ -358,7 +358,7 @@ __kernel void m10400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! @@ -577,7 +577,7 @@ __kernel void m10400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! diff --git a/OpenCL/m10400_a1.cl b/OpenCL/m10400_a1.cl index 86c38b14c..66fded4c8 100644 --- a/OpenCL/m10400_a1.cl +++ b/OpenCL/m10400_a1.cl @@ -280,7 +280,7 @@ __kernel void m10400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -355,7 +355,7 @@ __kernel void m10400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -410,7 +410,7 @@ __kernel void m10400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! @@ -541,7 +541,7 @@ __kernel void m10400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -628,7 +628,7 @@ __kernel void m10400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -683,7 +683,7 @@ __kernel void m10400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! diff --git a/OpenCL/m10400_a3.cl b/OpenCL/m10400_a3.cl index 7fadb69e0..5730bb926 100644 --- a/OpenCL/m10400_a3.cl +++ b/OpenCL/m10400_a3.cl @@ -303,7 +303,7 @@ static void m10400m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! @@ -461,7 +461,7 @@ static void m10400s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le_S (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! diff --git a/OpenCL/m10410_a1.cl b/OpenCL/m10410_a1.cl index 3437cfee6..c1f4150f7 100644 --- a/OpenCL/m10410_a1.cl +++ b/OpenCL/m10410_a1.cl @@ -181,7 +181,7 @@ __kernel void m10410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -227,7 +227,7 @@ __kernel void m10410_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[2]; @@ -315,7 +315,7 @@ __kernel void m10410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -373,7 +373,7 @@ __kernel void m10410_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[2]; diff --git a/OpenCL/m10410_a3.cl b/OpenCL/m10410_a3.cl index 9924d3a16..53f9c0713 100644 --- a/OpenCL/m10410_a3.cl +++ b/OpenCL/m10410_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 padding[8] = { @@ -49,10 +49,10 @@ static void swap (__local RC4_KEY *rc4_key, const u8 i, const u8 j) rc4_key->S[j] = tmp; } -static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) +static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32x data[4]) { - u32 v = 0x03020100; - u32 a = 0x04040404; + u32x v = 0x03020100; + u32x a = 0x04040404; __local u32 *ptr = (__local u32 *) rc4_key->S; @@ -62,11 +62,11 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) ptr[i] = v; v += a; } - const u32 d0 = data[0] >> 0; - const u32 d1 = data[0] >> 8; - const u32 d2 = data[0] >> 16; - const u32 d3 = data[0] >> 24; - const u32 d4 = data[1] >> 0; + const u32x d0 = data[0] >> 0; + const u32x d1 = data[0] >> 8; + const u32x d2 = data[0] >> 16; + const u32x d3 = data[0] >> 24; + const u32x d4 = data[1] >> 0; u32 j = 0; @@ -83,11 +83,11 @@ static void rc4_init_16 (__local RC4_KEY *rc4_key, const u32 data[4]) j += rc4_key->S[255] + d0; swap (rc4_key, 255, j); } -static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32 in[4], u32 out[4]) +static u8 rc4_next_16 (__local RC4_KEY *rc4_key, u8 i, u8 j, __constant u32x in[4], u32x out[4]) { for (u32 k = 0; k < 4; k++) { - u32 xor4 = 0; + u32x xor4 = 0; u8 idx; @@ -150,33 +150,28 @@ static void m10410m (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; // now the RC4 part - u32 key[4]; + u32x key[4]; - key[0] = w0[0]; + key[0] = w0lr; key[1] = w0[1]; key[2] = 0; key[3] = 0; rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; rc4_next_16 (rc4_key, 0, 0, padding, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_M + COMPARE_M_SIMD (out[0], out[1], out[2], out[3]); } } @@ -209,33 +204,28 @@ static void m10410s (__local RC4_KEY rc4_keys[64], u32 w0[4], u32 w1[4], u32 w2[ u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; // now the RC4 part - u32 key[4]; + u32x key[4]; - key[0] = w0[0]; + key[0] = w0lr; key[1] = w0[1]; key[2] = 0; key[3] = 0; rc4_init_16 (rc4_key, key); - u32 out[4]; + u32x out[4]; rc4_next_16 (rc4_key, 0, 0, padding, out); - const u32 r0 = out[0]; - const u32 r1 = out[1]; - const u32 r2 = out[2]; - const u32 r3 = out[3]; - - #include COMPARE_S + COMPARE_S_SIMD (out[0], out[1], out[2], out[3]); } } diff --git a/OpenCL/m10420_a0.cl b/OpenCL/m10420_a0.cl index 4a87fc962..a4f788c41 100644 --- a/OpenCL/m10420_a0.cl +++ b/OpenCL/m10420_a0.cl @@ -248,7 +248,7 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! @@ -447,7 +447,7 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! diff --git a/OpenCL/m10420_a1.cl b/OpenCL/m10420_a1.cl index 62a7949b3..17778ad4f 100644 --- a/OpenCL/m10420_a1.cl +++ b/OpenCL/m10420_a1.cl @@ -180,7 +180,7 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -247,7 +247,7 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -302,7 +302,7 @@ __kernel void m10420_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! @@ -423,7 +423,7 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -502,7 +502,7 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -557,7 +557,7 @@ __kernel void m10420_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! diff --git a/OpenCL/m10420_a3.cl b/OpenCL/m10420_a3.cl index 14429c8b3..3e4191c8c 100644 --- a/OpenCL/m10420_a3.cl +++ b/OpenCL/m10420_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 padding[8] = { @@ -32,29 +32,29 @@ __constant u32 padding[8] = 0x7a695364 }; -static void md5_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u32 digest[4]) +static void md5_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u32x digest[4]) { - u32 a = digest[0]; - u32 b = digest[1]; - u32 c = digest[2]; - u32 d = digest[3]; + u32x a = digest[0]; + u32x b = digest[1]; + u32x c = digest[2]; + u32x d = digest[3]; - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = w3[2]; - u32 wf_t = w3[3]; + u32x w0_t = w0[0]; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = w3[2]; + u32x wf_t = w3[3]; MD5_STEP (MD5_Fo, a, b, c, d, w0_t, MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w1_t, MD5C01, MD5S01); @@ -169,16 +169,16 @@ static void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; // max length supported by pdf11 is 32 @@ -199,13 +199,13 @@ static void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! // add o_buf - w0_t[0] |= w0[0]; + w0_t[0] |= w0lr; w0_t[1] |= w0[1]; w0_t[2] |= w0[2]; w0_t[3] |= w0[3]; @@ -222,7 +222,7 @@ static void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = o_buf[6]; w3_t[3] = o_buf[7]; - u32 digest[4]; + u32x digest[4]; digest[0] = MD5M_A; digest[1] = MD5M_B; @@ -250,15 +250,12 @@ static void m10420m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = digest[0]; - u32 b = digest[1] & 0xff; + u32x a = digest[0]; + u32x b = digest[1] & 0xff; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } @@ -313,16 +310,16 @@ static void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; // max length supported by pdf11 is 32 @@ -343,13 +340,13 @@ static void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! // add o_buf - w0_t[0] |= w0[0]; + w0_t[0] |= w0lr; w0_t[1] |= w0[1]; w0_t[2] |= w0[2]; w0_t[3] |= w0[3]; @@ -366,7 +363,7 @@ static void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le w3_t[2] = o_buf[6]; w3_t[3] = o_buf[7]; - u32 digest[4]; + u32x digest[4]; digest[0] = MD5M_A; digest[1] = MD5M_B; @@ -394,15 +391,12 @@ static void m10420s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le md5_transform (w0_t, w1_t, w2_t, w3_t, digest); - u32 a = digest[0]; - u32 b = digest[1] & 0xff; + u32x a = digest[0]; + u32x b = digest[1] & 0xff; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } diff --git a/OpenCL/m10500.cl b/OpenCL/m10500.cl index 14df5c7f1..4f6f1fe37 100644 --- a/OpenCL/m10500.cl +++ b/OpenCL/m10500.cl @@ -380,7 +380,7 @@ __kernel void m10500_init (__global pw_t *pws, __global kernel_rule_t *rules_buf w3_t[2] = 0; w3_t[3] = 0; - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); // add password // truncate at 32 is wanted, not a bug! diff --git a/OpenCL/m10800_a1.cl b/OpenCL/m10800_a1.cl index 066b503dc..c54793465 100644 --- a/OpenCL/m10800_a1.cl +++ b/OpenCL/m10800_a1.cl @@ -190,7 +190,7 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -235,7 +235,7 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -366,7 +366,7 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -423,7 +423,7 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m10800_a3.cl b/OpenCL/m10800_a3.cl index 6207ca9f4..82ec7a111 100644 --- a/OpenCL/m10800_a3.cl +++ b/OpenCL/m10800_a3.cl @@ -5,6 +5,8 @@ #define _SHA384_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u64 k_sha384[80] = { @@ -44,33 +44,33 @@ __constant u64 k_sha384[80] = SHA384C4c, SHA384C4d, SHA384C4e, SHA384C4f, }; -static void sha384_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], const u32 w3[4], u64 digest[8]) +static void sha384_transform (const u32x w0[4], const u32x w1[4], const u32x w2[4], const u32x w3[4], u64x digest[8]) { - u64 w0_t = hl32_to_64 (w0[0], w0[1]); - u64 w1_t = hl32_to_64 (w0[2], w0[3]); - u64 w2_t = hl32_to_64 (w1[0], w1[1]); - u64 w3_t = hl32_to_64 (w1[2], w1[3]); - u64 w4_t = hl32_to_64 (w2[0], w2[1]); - u64 w5_t = hl32_to_64 (w2[2], w2[3]); - u64 w6_t = hl32_to_64 (w3[0], w3[1]); - u64 w7_t = 0; - u64 w8_t = 0; - u64 w9_t = 0; - u64 wa_t = 0; - u64 wb_t = 0; - u64 wc_t = 0; - u64 wd_t = 0; - u64 we_t = 0; - u64 wf_t = hl32_to_64 (w3[2], w3[3]); + u64x w0_t = hl32_to_64 (w0[0], w0[1]); + u64x w1_t = hl32_to_64 (w0[2], w0[3]); + u64x w2_t = hl32_to_64 (w1[0], w1[1]); + u64x w3_t = hl32_to_64 (w1[2], w1[3]); + u64x w4_t = hl32_to_64 (w2[0], w2[1]); + u64x w5_t = hl32_to_64 (w2[2], w2[3]); + u64x w6_t = hl32_to_64 (w3[0], w3[1]); + u64x w7_t = 0; + u64x w8_t = 0; + u64x w9_t = 0; + u64x wa_t = 0; + u64x wb_t = 0; + u64x wc_t = 0; + u64x wd_t = 0; + u64x we_t = 0; + u64x wf_t = hl32_to_64 (w3[2], w3[3]); - u64 a = digest[0]; - u64 b = digest[1]; - u64 c = digest[2]; - u64 d = digest[3]; - u64 e = digest[4]; - u64 f = digest[5]; - u64 g = digest[6]; - u64 h = digest[7]; + u64x a = digest[0]; + u64x b = digest[1]; + u64x c = digest[2]; + u64x d = digest[3]; + u64x e = digest[4]; + u64x f = digest[5]; + u64x g = digest[6]; + u64x h = digest[7]; #define ROUND_EXPAND() \ { \ @@ -140,7 +140,7 @@ static void sha384_transform (const u32 w0[4], const u32 w1[4], const u32 w2[4], digest[7] = 0; } -static void m10800m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m10800m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -155,16 +155,16 @@ static void m10800m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -183,7 +183,7 @@ static void m10800m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA384M_A; digest[1] = SHA384M_B; @@ -196,16 +196,16 @@ static void m10800m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha384_transform (w0_t, w1_t, w2_t, w3_t, digest); - const u32 r0 = l32_from_64 (digest[3]); - const u32 r1 = h32_from_64 (digest[3]); - const u32 r2 = l32_from_64 (digest[2]); - const u32 r3 = h32_from_64 (digest[2]); + const u32x r0 = l32_from_64 (digest[3]); + const u32x r1 = h32_from_64 (digest[3]); + const u32x r2 = l32_from_64 (digest[2]); + const u32x r3 = h32_from_64 (digest[2]); - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } -static void m10800s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m10800s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -232,16 +232,16 @@ static void m10800s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = w0; w0_t[1] = w[ 1]; @@ -260,7 +260,7 @@ static void m10800s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w3_t[2] = w[14]; w3_t[3] = w[15]; - u64 digest[8]; + u64x digest[8]; digest[0] = SHA384M_A; digest[1] = SHA384M_B; @@ -273,16 +273,16 @@ static void m10800s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k sha384_transform (w0_t, w1_t, w2_t, w3_t, digest); - const u32 r0 = l32_from_64 (digest[3]); - const u32 r1 = h32_from_64 (digest[3]); - const u32 r2 = l32_from_64 (digest[2]); - const u32 r3 = h32_from_64 (digest[2]); + const u32x r0 = l32_from_64 (digest[3]); + const u32x r1 = h32_from_64 (digest[3]); + const u32x r2 = l32_from_64 (digest[2]); + const u32x r3 = h32_from_64 (digest[2]); - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } -__kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -320,7 +320,7 @@ __kernel void m10800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10800m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10800_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10800_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -358,7 +358,7 @@ __kernel void m10800_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10800m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10800_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10800_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -396,7 +396,7 @@ __kernel void m10800_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10800m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -434,7 +434,7 @@ __kernel void m10800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10800s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10800_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10800_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -472,7 +472,7 @@ __kernel void m10800_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m10800s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m10800_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m10800_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m11000_a1.cl b/OpenCL/m11000_a1.cl index d5514f848..079873706 100644 --- a/OpenCL/m11000_a1.cl +++ b/OpenCL/m11000_a1.cl @@ -68,7 +68,7 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -137,7 +137,7 @@ __kernel void m11000_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -443,7 +443,7 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -524,7 +524,7 @@ __kernel void m11000_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m11000_a3.cl b/OpenCL/m11000_a3.cl index b12642cbc..e2fb91f21 100644 --- a/OpenCL/m11000_a3.cl +++ b/OpenCL/m11000_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -69,11 +69,11 @@ static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * prepend salt @@ -81,10 +81,10 @@ static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // first step fixed 56 bytes of salt - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; @@ -103,7 +103,7 @@ static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // after 56 byte salt, we have beginning of the password - w3_t[2] = w0[0]; + w3_t[2] = w0lr; w3_t[3] = w0[1]; /** @@ -112,10 +112,10 @@ static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // first transform - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -190,10 +190,10 @@ static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -287,12 +287,7 @@ static void m11000m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -357,11 +352,11 @@ static void m11000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * prepend salt @@ -369,10 +364,10 @@ static void m11000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // first step fixed 56 bytes of salt - u32 w0_t[4]; - u32 w1_t[4]; - u32 w2_t[4]; - u32 w3_t[4]; + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; w0_t[0] = salt_buf0[0]; w0_t[1] = salt_buf0[1]; @@ -391,7 +386,7 @@ static void m11000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // after 56 byte salt, we have beginning of the password - w3_t[2] = w0[0]; + w3_t[2] = w0lr; w3_t[3] = w0[1]; /** @@ -400,10 +395,10 @@ static void m11000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le // first transform - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -478,10 +473,10 @@ static void m11000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -575,12 +570,7 @@ static void m11000s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m11100_a0.cl b/OpenCL/m11100_a0.cl index 0d2fbeba5..fcd566c73 100644 --- a/OpenCL/m11100_a0.cl +++ b/OpenCL/m11100_a0.cl @@ -169,7 +169,7 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * append the salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); const u32 pw_salt_len = out_len + salt_len; @@ -555,7 +555,7 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * append the salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); const u32 pw_salt_len = out_len + salt_len; diff --git a/OpenCL/m11100_a1.cl b/OpenCL/m11100_a1.cl index b05419ddd..baca6621c 100644 --- a/OpenCL/m11100_a1.cl +++ b/OpenCL/m11100_a1.cl @@ -87,7 +87,7 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -158,7 +158,7 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0_t[4]; @@ -193,7 +193,7 @@ __kernel void m11100_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * append the salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -487,7 +487,7 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -570,7 +570,7 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0_t[4]; @@ -605,7 +605,7 @@ __kernel void m11100_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, * append the salt */ - switch_buffer_by_offset (w0_t, w1_t, w2_t, w3_t, pw_len); + switch_buffer_by_offset_le (w0_t, w1_t, w2_t, w3_t, pw_len); const u32 pw_salt_len = pw_len + salt_len; diff --git a/OpenCL/m11100_a3.cl b/OpenCL/m11100_a3.cl index d91efd5ba..603178314 100644 --- a/OpenCL/m11100_a3.cl +++ b/OpenCL/m11100_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_lower8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -73,7 +81,7 @@ static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -83,34 +91,34 @@ static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0] | salt_buf0[0]; + w0_t[0] = w0lr | salt_buf0[0]; w0_t[1] = w0[1] | salt_buf0[1]; w0_t[2] = w0[2] | salt_buf0[2]; w0_t[3] = w0[3] | salt_buf0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0] | salt_buf1[0]; w1_t[1] = w1[1] | salt_buf1[1]; w1_t[2] = w1[2] | salt_buf1[2]; w1_t[3] = w1[3] | salt_buf1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0] | salt_buf2[0]; w2_t[1] = w2[1] | salt_buf2[1]; w2_t[2] = w2[2] | salt_buf2[2]; w2_t[3] = w2[3] | salt_buf2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0] | salt_buf3[0]; w3_t[1] = w3[1] | salt_buf3[1]; @@ -121,10 +129,10 @@ static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * md5 ($pass.$salt) */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -305,12 +313,7 @@ static void m11100m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (a, d, c, b); } } @@ -377,7 +380,7 @@ static void m11100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le const u32 salt_len = salt_bufs[salt_pos].salt_len - 4; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le_S (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); const u32 pw_salt_len = pw_len + salt_len; @@ -387,34 +390,34 @@ static void m11100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; - u32 w0_t[4]; + u32x w0_t[4]; - w0_t[0] = w0[0] | salt_buf0[0]; + w0_t[0] = w0lr | salt_buf0[0]; w0_t[1] = w0[1] | salt_buf0[1]; w0_t[2] = w0[2] | salt_buf0[2]; w0_t[3] = w0[3] | salt_buf0[3]; - u32 w1_t[4]; + u32x w1_t[4]; w1_t[0] = w1[0] | salt_buf1[0]; w1_t[1] = w1[1] | salt_buf1[1]; w1_t[2] = w1[2] | salt_buf1[2]; w1_t[3] = w1[3] | salt_buf1[3]; - u32 w2_t[4]; + u32x w2_t[4]; w2_t[0] = w2[0] | salt_buf2[0]; w2_t[1] = w2[1] | salt_buf2[1]; w2_t[2] = w2[2] | salt_buf2[2]; w2_t[3] = w2[3] | salt_buf2[3]; - u32 w3_t[4]; + u32x w3_t[4]; w3_t[0] = w3[0] | salt_buf3[0]; w3_t[1] = w3[1] | salt_buf3[1]; @@ -425,10 +428,10 @@ static void m11100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le * md5 ($pass.$salt) */ - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -609,12 +612,7 @@ static void m11100s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le MD5_STEP (MD5_I , c, d, a, b, w0_t[2], MD5C3e, MD5S32); MD5_STEP (MD5_I , b, c, d, a, w2_t[1], MD5C3f, MD5S33); - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m11200_a1.cl b/OpenCL/m11200_a1.cl index 98620462e..830761f45 100644 --- a/OpenCL/m11200_a1.cl +++ b/OpenCL/m11200_a1.cl @@ -70,7 +70,7 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -127,7 +127,7 @@ __kernel void m11200_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -619,7 +619,7 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -688,7 +688,7 @@ __kernel void m11200_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m11200_a3.cl b/OpenCL/m11200_a3.cl index ac0de34fa..8f110bae9 100644 --- a/OpenCL/m11200_a3.cl +++ b/OpenCL/m11200_a3.cl @@ -5,6 +5,8 @@ #define _SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { @@ -35,11 +35,11 @@ static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf[5]; - salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32_S (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32_S (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32_S (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32_S (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32_S (salt_bufs[salt_pos].salt_buf[4]); /** * loop @@ -47,38 +47,38 @@ static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 ($pass) */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -176,11 +176,11 @@ static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - u32 plain_sha1_a = a + SHA1M_A; - u32 plain_sha1_b = b + SHA1M_B; - u32 plain_sha1_c = c + SHA1M_C; - u32 plain_sha1_d = d + SHA1M_D; - u32 plain_sha1_e = e + SHA1M_E; + u32x plain_sha1_a = a + SHA1M_A; + u32x plain_sha1_b = b + SHA1M_B; + u32x plain_sha1_c = c + SHA1M_C; + u32x plain_sha1_d = d + SHA1M_D; + u32x plain_sha1_e = e + SHA1M_E; /** * sha1 (sha1 ($pass)) @@ -447,12 +447,7 @@ static void m11200m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d ^= plain_sha1_d; e ^= plain_sha1_e; - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_M + COMPARE_M_SIMD (d, e, c, b); } } @@ -483,11 +478,11 @@ static void m11200s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 salt_buf[5]; - salt_buf[0] = swap32 (salt_bufs[salt_pos].salt_buf[0]); - salt_buf[1] = swap32 (salt_bufs[salt_pos].salt_buf[1]); - salt_buf[2] = swap32 (salt_bufs[salt_pos].salt_buf[2]); - salt_buf[3] = swap32 (salt_bufs[salt_pos].salt_buf[3]); - salt_buf[4] = swap32 (salt_bufs[salt_pos].salt_buf[4]); + salt_buf[0] = swap32_S (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = swap32_S (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = swap32_S (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = swap32_S (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = swap32_S (salt_bufs[salt_pos].salt_buf[4]); /** * loop @@ -495,38 +490,38 @@ static void m11200s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 ($pass) */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; #undef K #define K SHA1C00 @@ -624,11 +619,11 @@ static void m11200s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we_t = rotl32 ((wb_t ^ w6_t ^ w0_t ^ we_t), 1u); SHA1_STEP (SHA1_F1, c, d, e, a, b, we_t); wf_t = rotl32 ((wc_t ^ w7_t ^ w1_t ^ wf_t), 1u); SHA1_STEP (SHA1_F1, b, c, d, e, a, wf_t); - u32 plain_sha1_a = a + SHA1M_A; - u32 plain_sha1_b = b + SHA1M_B; - u32 plain_sha1_c = c + SHA1M_C; - u32 plain_sha1_d = d + SHA1M_D; - u32 plain_sha1_e = e + SHA1M_E; + u32x plain_sha1_a = a + SHA1M_A; + u32x plain_sha1_b = b + SHA1M_B; + u32x plain_sha1_c = c + SHA1M_C; + u32x plain_sha1_d = d + SHA1M_D; + u32x plain_sha1_e = e + SHA1M_E; /** * sha1 (sha1 ($pass)) @@ -895,12 +890,7 @@ static void m11200s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le d ^= plain_sha1_d; e ^= plain_sha1_e; - const u32 r0 = d; - const u32 r1 = e; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (d, e, c, b); } } diff --git a/OpenCL/m11300.cl b/OpenCL/m11300.cl index d862cbe67..9c61cba83 100644 --- a/OpenCL/m11300.cl +++ b/OpenCL/m11300.cl @@ -1074,7 +1074,7 @@ __kernel void m11300_init (__global pw_t *pws, __global kernel_rule_t *rules_buf u32 salt_len = salt_bufs[salt_pos].salt_len; - switch_buffer_by_offset (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); + switch_buffer_by_offset_le (salt_buf0, salt_buf1, salt_buf2, salt_buf3, pw_len); w0[0] |= salt_buf0[0]; w0[1] |= salt_buf0[1]; diff --git a/OpenCL/m11400_a1.cl b/OpenCL/m11400_a1.cl index 741a60f68..562451c31 100644 --- a/OpenCL/m11400_a1.cl +++ b/OpenCL/m11400_a1.cl @@ -810,7 +810,7 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -963,7 +963,7 @@ __kernel void m11400_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -1671,7 +1671,7 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -1836,7 +1836,7 @@ __kernel void m11400_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m11400_a3.cl b/OpenCL/m11400_a3.cl index 84ca9329d..d4e3e3673 100644 --- a/OpenCL/m11400_a3.cl +++ b/OpenCL/m11400_a3.cl @@ -5,6 +5,8 @@ #define _MD5_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,13 +18,19 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#if VECT_SIZE == 1 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_lower8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif -#define uint_to_hex_lower8(i) l_bin2asc[(i)] - -static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const u32 append0[4], const u32 append1[4], const u32 append2[4], const u32 append3[4], const u32 append_len) +static u32 memcat32 (u32x block0[16], u32x block1[16], const u32 block_len, const u32x append0[4], const u32x append1[4], const u32x append2[4], const u32x append3[4], const u32 append_len) { const u32 mod = block_len & 3; const u32 div = block_len / 4; @@ -30,35 +38,35 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const #if defined IS_AMD || defined IS_GENERIC const int offset_minus_4 = 4 - mod; - u32 append0_t[4]; + u32x append0_t[4]; append0_t[0] = amd_bytealign (append0[0], 0, offset_minus_4); append0_t[1] = amd_bytealign (append0[1], append0[0], offset_minus_4); append0_t[2] = amd_bytealign (append0[2], append0[1], offset_minus_4); append0_t[3] = amd_bytealign (append0[3], append0[2], offset_minus_4); - u32 append1_t[4]; + u32x append1_t[4]; append1_t[0] = amd_bytealign (append1[0], append0[3], offset_minus_4); append1_t[1] = amd_bytealign (append1[1], append1[0], offset_minus_4); append1_t[2] = amd_bytealign (append1[2], append1[1], offset_minus_4); append1_t[3] = amd_bytealign (append1[3], append1[2], offset_minus_4); - u32 append2_t[4]; + u32x append2_t[4]; append2_t[0] = amd_bytealign (append2[0], append1[3], offset_minus_4); append2_t[1] = amd_bytealign (append2[1], append2[0], offset_minus_4); append2_t[2] = amd_bytealign (append2[2], append2[1], offset_minus_4); append2_t[3] = amd_bytealign (append2[3], append2[2], offset_minus_4); - u32 append3_t[4]; + u32x append3_t[4]; append3_t[0] = amd_bytealign (append3[0], append2[3], offset_minus_4); append3_t[1] = amd_bytealign (append3[1], append3[0], offset_minus_4); append3_t[2] = amd_bytealign (append3[2], append3[1], offset_minus_4); append3_t[3] = amd_bytealign (append3[3], append3[2], offset_minus_4); - u32 append4_t[4]; + u32x append4_t[4]; append4_t[0] = amd_bytealign ( 0, append3[3], offset_minus_4); append4_t[1] = 0; @@ -100,35 +108,35 @@ static u32 memcat32 (u32 block0[16], u32 block1[16], const u32 block_len, const const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - u32 append0_t[4]; + u32x append0_t[4]; append0_t[0] = __byte_perm ( 0, append0[0], selector); append0_t[1] = __byte_perm (append0[0], append0[1], selector); append0_t[2] = __byte_perm (append0[1], append0[2], selector); append0_t[3] = __byte_perm (append0[2], append0[3], selector); - u32 append1_t[4]; + u32x append1_t[4]; append1_t[0] = __byte_perm (append0[3], append1[0], selector); append1_t[1] = __byte_perm (append1[0], append1[1], selector); append1_t[2] = __byte_perm (append1[1], append1[2], selector); append1_t[3] = __byte_perm (append1[2], append1[3], selector); - u32 append2_t[4]; + u32x append2_t[4]; append2_t[0] = __byte_perm (append1[3], append2[0], selector); append2_t[1] = __byte_perm (append2[0], append2[1], selector); append2_t[2] = __byte_perm (append2[1], append2[2], selector); append2_t[3] = __byte_perm (append2[2], append2[3], selector); - u32 append3_t[4]; + u32x append3_t[4]; append3_t[0] = __byte_perm (append2[3], append3[0], selector); append3_t[1] = __byte_perm (append3[0], append3[1], selector); append3_t[2] = __byte_perm (append3[1], append3[2], selector); append3_t[3] = __byte_perm (append3[2], append3[3], selector); - u32 append4_t[4]; + u32x append4_t[4]; append4_t[0] = __byte_perm (append3[3], 0, selector); append4_t[1] = 0; @@ -862,11 +870,11 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -874,7 +882,7 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -893,7 +901,7 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - u32 block1[16]; + u32x block1[16]; block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; @@ -912,31 +920,48 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = pw_salt_len * 8; @@ -944,12 +969,12 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -1137,10 +1162,10 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -1237,12 +1262,7 @@ static void m11400m_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } @@ -1384,11 +1404,11 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -1396,7 +1416,7 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -1415,7 +1435,7 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - u32 block1[16]; + u32x block1[16]; block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; @@ -1434,31 +1454,48 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = pw_salt_len * 8; @@ -1466,12 +1503,12 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -1659,10 +1696,10 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -1859,12 +1896,7 @@ static void m11400m_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } @@ -1987,11 +2019,11 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -1999,7 +2031,7 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -2018,7 +2050,7 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - u32 block1[16]; + u32x block1[16]; block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; @@ -2037,31 +2069,48 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -2069,12 +2118,12 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -2149,10 +2198,10 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; @@ -2460,12 +2509,7 @@ static void m11400m_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } @@ -2607,11 +2651,11 @@ static void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -2619,7 +2663,7 @@ static void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -2657,31 +2701,48 @@ static void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -2689,12 +2750,12 @@ static void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -2769,10 +2830,10 @@ static void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; @@ -3180,12 +3241,7 @@ static void m11400m_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } @@ -3308,11 +3364,11 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -3320,7 +3376,7 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -3339,7 +3395,7 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - u32 block1[16]; + u32x block1[16]; block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; @@ -3358,31 +3414,48 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = pw_salt_len * 8; @@ -3390,12 +3463,12 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -3583,10 +3656,10 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -3683,12 +3756,7 @@ static void m11400s_0_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } @@ -3830,11 +3898,11 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -3842,7 +3910,7 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -3861,7 +3929,7 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - u32 block1[16]; + u32x block1[16]; block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; @@ -3880,31 +3948,48 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = pw_salt_len * 8; @@ -3912,12 +3997,12 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -4105,10 +4190,10 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; // 2nd transform @@ -4305,12 +4390,7 @@ static void m11400s_0_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } @@ -4433,11 +4513,11 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -4445,7 +4525,7 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -4464,7 +4544,7 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - u32 block1[16]; + u32x block1[16]; block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; @@ -4483,31 +4563,48 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -4515,12 +4612,12 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -4595,10 +4692,10 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; @@ -4906,12 +5003,7 @@ static void m11400s_1_0 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } @@ -5053,11 +5145,11 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /* * HA1 = md5 ($salt . $pass) @@ -5065,7 +5157,7 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // append the pass to the salt - u32 block0[16]; + u32x block0[16]; block0[ 0] = salt_buf0[ 0]; block0[ 1] = salt_buf0[ 1]; @@ -5084,7 +5176,7 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block0[14] = salt_buf0[14]; block0[15] = salt_buf0[15]; - u32 block1[16]; + u32x block1[16]; block1[ 0] = salt_buf1[ 0]; block1[ 1] = salt_buf1[ 1]; @@ -5103,31 +5195,48 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p block1[14] = salt_buf1[14]; block1[15] = salt_buf1[15]; - memcat32 (block0, block1, salt_len, w0, w1, w2, w3, pw_len); + u32x w0_t[4]; + u32x w1_t[4]; + u32x w2_t[4]; + u32x w3_t[4]; - u32 w0_t[4]; + w0_t[0] = w0lr; + w0_t[1] = w0[1]; + w0_t[2] = w0[2]; + w0_t[3] = w0[3]; + + w1_t[0] = w1[0]; + w1_t[1] = w1[1]; + w1_t[2] = w1[2]; + w1_t[3] = w1[3]; + + w2_t[0] = w2[0]; + w2_t[1] = w2[1]; + w2_t[2] = w2[2]; + w2_t[3] = w2[3]; + + w3_t[0] = w3[0]; + w3_t[1] = w3[1]; + w3_t[2] = w3[2]; + w3_t[3] = w3[3]; + + memcat32 (block0, block1, salt_len, w0_t, w1_t, w2_t, w3_t, pw_len); w0_t[0] = block0[ 0]; w0_t[1] = block0[ 1]; w0_t[2] = block0[ 2]; w0_t[3] = block0[ 3]; - u32 w1_t[4]; - w1_t[0] = block0[ 4]; w1_t[1] = block0[ 5]; w1_t[2] = block0[ 6]; w1_t[3] = block0[ 7]; - u32 w2_t[4]; - w2_t[0] = block0[ 8]; w2_t[1] = block0[ 9]; w2_t[2] = block0[10]; w2_t[3] = block0[11]; - u32 w3_t[4]; - w3_t[0] = block0[12]; w3_t[1] = block0[13]; w3_t[2] = block0[14]; @@ -5135,12 +5244,12 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p // md5 - u32 tmp2; + u32x tmp2; - u32 a = MD5M_A; - u32 b = MD5M_B; - u32 c = MD5M_C; - u32 d = MD5M_D; + u32x a = MD5M_A; + u32x b = MD5M_B; + u32x c = MD5M_C; + u32x d = MD5M_D; MD5_STEP (MD5_Fo, a, b, c, d, w0_t[0], MD5C00, MD5S00); MD5_STEP (MD5_Fo, d, a, b, c, w0_t[1], MD5C01, MD5S01); @@ -5215,10 +5324,10 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += MD5M_C; d += MD5M_D; - u32 r_a = a; - u32 r_b = b; - u32 r_c = c; - u32 r_d = d; + u32x r_a = a; + u32x r_b = b; + u32x r_c = c; + u32x r_d = d; w0_t[0] = block1[ 0]; w0_t[1] = block1[ 1]; @@ -5626,12 +5735,7 @@ static void m11400s_1_1 (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 p c += r_c; d += r_d; - const u32 r0 = a; - const u32 r1 = d; - const u32 r2 = c; - const u32 r3 = b; - - #include COMPARE_S + COMPARE_S_SIMD (a, d, c, b); } } diff --git a/OpenCL/m11500_a1.cl b/OpenCL/m11500_a1.cl index 1fbc90aff..c537c0eec 100644 --- a/OpenCL/m11500_a1.cl +++ b/OpenCL/m11500_a1.cl @@ -175,7 +175,7 @@ __kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -218,7 +218,7 @@ __kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w_t[16]; @@ -308,7 +308,7 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -365,7 +365,7 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w_t[16]; diff --git a/OpenCL/m11500_a3.cl b/OpenCL/m11500_a3.cl index 13efe12ab..166dd2413 100644 --- a/OpenCL/m11500_a3.cl +++ b/OpenCL/m11500_a3.cl @@ -5,6 +5,8 @@ #define _CRC32_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,9 +18,7 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" __constant u32 crc32tab[0x100] = { @@ -88,22 +88,30 @@ __constant u32 crc32tab[0x100] = 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d }; -static u32 round_crc32 (u32 a, const u32 v) +static u32x round_crc32 (u32x a, const u32x v) { - const u32 k = (a ^ v) & 0xff; + const u32x k = (a ^ v) & 0xff; - const u32 s = a >> 8; + const u32x s = a >> 8; - a = crc32tab[k]; + #if VECT_SIZE == 1 + a = (u32x) crc32tab[k]; + #elif VECT_SIZE == 2 + a = (u32x) (crc32tab[k.s0], crc32tab[k.s1]); + #elif VECT_SIZE == 4 + a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3]); + #elif VECT_SIZE == 8 + a = (u32x) (crc32tab[k.s0], crc32tab[k.s1], crc32tab[k.s2], crc32tab[k.s3], crc32tab[k.s4], crc32tab[k.s5], crc32tab[k.s6], crc32tab[k.s7]); + #endif a ^= s; return a; } -static u32 crc32 (const u32 w[16], const u32 pw_len, const u32 iv) +static u32x crc32 (const u32x w[16], const u32 pw_len, const u32 iv) { - u32 a = iv ^ ~0; + u32x a = iv ^ ~0; if (pw_len >= 1) a = round_crc32 (a, w[0] >> 0); if (pw_len >= 2) a = round_crc32 (a, w[0] >> 8); @@ -121,7 +129,7 @@ static u32 crc32 (const u32 w[16], const u32 pw_len, const u32 iv) return ~a; } -static void m11500m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m11500m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -142,13 +150,13 @@ static void m11500m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w_t[16]; + u32x w_t[16]; w_t[ 0] = w0; w_t[ 1] = w[ 1]; @@ -167,19 +175,16 @@ static void m11500m (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w_t[14] = w[14]; w_t[15] = w[15]; - u32 a = crc32 (w_t, pw_len, iv); - u32 b = 0; + u32x a = crc32 (w_t, pw_len, iv); + u32x b = 0; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_M + COMPARE_M_SIMD (a, b, c, d); } } -static void m11500s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) +static void m11500s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset) { /** * modifier @@ -208,13 +213,13 @@ static void m11500s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = words_buf_r[il_pos]; + const u32x w0r = words_buf_r[il_pos / VECT_SIZE]; - const u32 w0 = w0l | w0r; + const u32x w0 = w0l | w0r; - u32 w_t[16]; + u32x w_t[16]; w_t[ 0] = w0; w_t[ 1] = w[ 1]; @@ -233,19 +238,16 @@ static void m11500s (u32 w[16], const u32 pw_len, __global pw_t *pws, __global k w_t[14] = w[14]; w_t[15] = w[15]; - u32 a = crc32 (w_t, pw_len, iv); - u32 b = 0; + u32x a = crc32 (w_t, pw_len, iv); + u32x b = 0; + u32x c = 0; + u32x d = 0; - const u32 r0 = a; - const u32 r1 = b; - const u32 r2 = 0; - const u32 r3 = 0; - - #include COMPARE_S + COMPARE_S_SIMD (a, b, c, d); } } -__kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -283,7 +285,7 @@ __kernel void m11500_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m11500m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m11500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -321,7 +323,7 @@ __kernel void m11500_m08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m11500m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m11500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -359,7 +361,7 @@ __kernel void m11500_m16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m11500m (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -397,7 +399,7 @@ __kernel void m11500_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m11500s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m11500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base @@ -435,7 +437,7 @@ __kernel void m11500_s08 (__global pw_t *pws, __global kernel_rule_t *rules_buf, m11500s (w, pw_len, pws, rules_buf, combs_buf, words_buf_r, tmps, hooks, bitmaps_buf_s1_a, bitmaps_buf_s1_b, bitmaps_buf_s1_c, bitmaps_buf_s1_d, bitmaps_buf_s2_a, bitmaps_buf_s2_b, bitmaps_buf_s2_c, bitmaps_buf_s2_d, plains_buf, digests_buf, hashes_shown, salt_bufs, esalt_bufs, d_return_buf, d_scryptV_buf, bitmap_mask, bitmap_shift1, bitmap_shift2, salt_pos, loop_pos, loop_cnt, bfs_cnt, digests_cnt, digests_offset); } -__kernel void m11500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32 * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) +__kernel void m11500_s16 (__global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __constant u32x * words_buf_r, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, const u32 combs_mode, const u32 gid_max) { /** * base diff --git a/OpenCL/m11700_a1.cl b/OpenCL/m11700_a1.cl index 9de15619f..e87a0c177 100644 --- a/OpenCL/m11700_a1.cl +++ b/OpenCL/m11700_a1.cl @@ -2352,7 +2352,7 @@ __kernel void m11700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -2395,7 +2395,7 @@ __kernel void m11700_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; @@ -2571,7 +2571,7 @@ __kernel void m11700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -2614,7 +2614,7 @@ __kernel void m11700_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; diff --git a/OpenCL/m11700_a3.cl b/OpenCL/m11700_a3.cl index 5d9eaa17a..1dbc959fe 100644 --- a/OpenCL/m11700_a3.cl +++ b/OpenCL/m11700_a3.cl @@ -5,6 +5,8 @@ #define _GOST2012_256_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,21 +18,29 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define INITVAL 0x0101010101010101 -#define SBOG_LPSti64 \ - s_sbob_sl64[0][(t[0] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[1][(t[1] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[2][(t[2] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[3][(t[3] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[4][(t[4] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[5][(t[5] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[6][(t[6] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[7][(t[7] >> (i * 8)) & 0xff] +#if VECT_SIZE == 1 +#define BOX(S,n,i) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#endif + +#define SBOG_LPSti64 \ + BOX (s_sbob_sl64, 0, ((t[0] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 1, ((t[1] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 2, ((t[2] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 3, ((t[3] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 4, ((t[4] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 5, ((t[5] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 6, ((t[6] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 7, ((t[7] >> (i * 8)) & 0xff)) // constants @@ -2226,11 +2236,11 @@ __constant u64 sbob_rc64[12][8] = }, }; -static void streebog_g (u64 h[8], const u64 m[8], __local u64 s_sbob_sl64[8][256]) +static void streebog_g (u64x h[8], const u64x m[8], __local u64 s_sbob_sl64[8][256]) { - u64 k[8]; - u64 s[8]; - u64 t[8]; + u64x k[8]; + u64x s[8]; + u64x t[8]; #pragma unroll for (int i = 0; i < 8; i++) @@ -2297,17 +2307,17 @@ static void m11700m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * reverse message block */ - u64 m[8]; + u64x m[8]; m[0] = hl32_to_64 (w[15], w[14]); m[1] = hl32_to_64 (w[13], w[12]); @@ -2316,7 +2326,7 @@ static void m11700m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[4] = hl32_to_64 (w[ 7], w[ 6]); m[5] = hl32_to_64 (w[ 5], w[ 4]); m[6] = hl32_to_64 (w[ 3], w[ 2]); - m[7] = hl32_to_64 (w[ 1], w[ 0]); + m[7] = hl32_to_64 (w[ 1], w0lr ); m[0] = swap64 (m[0]); m[1] = swap64 (m[1]); @@ -2329,7 +2339,7 @@ static void m11700m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le // state buffer (hash) - u64 h[8]; + u64x h[8]; h[0] = INITVAL; h[1] = INITVAL; @@ -2342,7 +2352,7 @@ static void m11700m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, m, s_sbob_sl64); - u64 z[8]; + u64x z[8]; z[0] = 0; z[1] = 0; @@ -2356,12 +2366,12 @@ static void m11700m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); - const u32 r0 = l32_from_64 (h[0]); - const u32 r1 = h32_from_64 (h[0]); - const u32 r2 = l32_from_64 (h[1]); - const u32 r3 = h32_from_64 (h[1]); + const u32x r0 = l32_from_64 (h[0]); + const u32x r1 = h32_from_64 (h[0]); + const u32x r2 = l32_from_64 (h[1]); + const u32x r3 = h32_from_64 (h[1]); - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -2392,17 +2402,17 @@ static void m11700s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * reverse message block */ - u64 m[8]; + u64x m[8]; m[0] = hl32_to_64 (w[15], w[14]); m[1] = hl32_to_64 (w[13], w[12]); @@ -2411,7 +2421,7 @@ static void m11700s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[4] = hl32_to_64 (w[ 7], w[ 6]); m[5] = hl32_to_64 (w[ 5], w[ 4]); m[6] = hl32_to_64 (w[ 3], w[ 2]); - m[7] = hl32_to_64 (w[ 1], w[ 0]); + m[7] = hl32_to_64 (w[ 1], w0lr ); m[0] = swap64 (m[0]); m[1] = swap64 (m[1]); @@ -2424,7 +2434,7 @@ static void m11700s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le // state buffer (hash) - u64 h[8]; + u64x h[8]; h[0] = INITVAL; h[1] = INITVAL; @@ -2437,7 +2447,7 @@ static void m11700s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, m, s_sbob_sl64); - u64 z[8]; + u64x z[8]; z[0] = 0; z[1] = 0; @@ -2451,12 +2461,12 @@ static void m11700s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); - const u32 r0 = l32_from_64 (h[0]); - const u32 r1 = h32_from_64 (h[0]); - const u32 r2 = l32_from_64 (h[1]); - const u32 r3 = h32_from_64 (h[1]); + const u32x r0 = l32_from_64 (h[0]); + const u32x r1 = h32_from_64 (h[0]); + const u32x r2 = l32_from_64 (h[1]); + const u32x r3 = h32_from_64 (h[1]); - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m11800_a1.cl b/OpenCL/m11800_a1.cl index 05b7c58b1..d89eaf992 100644 --- a/OpenCL/m11800_a1.cl +++ b/OpenCL/m11800_a1.cl @@ -2352,7 +2352,7 @@ __kernel void m11800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -2395,7 +2395,7 @@ __kernel void m11800_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; @@ -2571,7 +2571,7 @@ __kernel void m11800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -2614,7 +2614,7 @@ __kernel void m11800_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w[16]; diff --git a/OpenCL/m11800_a3.cl b/OpenCL/m11800_a3.cl index 3882d5928..07c7e8b73 100644 --- a/OpenCL/m11800_a3.cl +++ b/OpenCL/m11800_a3.cl @@ -5,6 +5,8 @@ #define _GOST2012_512_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,21 +18,29 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" - -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" +#include "OpenCL/simd.c" #define INITVAL 0 -#define SBOG_LPSti64 \ - s_sbob_sl64[0][(t[0] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[1][(t[1] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[2][(t[2] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[3][(t[3] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[4][(t[4] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[5][(t[5] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[6][(t[6] >> (i * 8)) & 0xff] ^ \ - s_sbob_sl64[7][(t[7] >> (i * 8)) & 0xff] +#if VECT_SIZE == 1 +#define BOX(S,n,i) (S)[(n)][(i)] +#elif VECT_SIZE == 2 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1]) +#elif VECT_SIZE == 4 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3]) +#elif VECT_SIZE == 8 +#define BOX(S,n,i) (u64x) ((S)[(n)][(i).s0], (S)[(n)][(i).s1], (S)[(n)][(i).s2], (S)[(n)][(i).s3], (S)[(n)][(i).s4], (S)[(n)][(i).s5], (S)[(n)][(i).s6], (S)[(n)][(i).s7]) +#endif + +#define SBOG_LPSti64 \ + BOX (s_sbob_sl64, 0, ((t[0] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 1, ((t[1] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 2, ((t[2] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 3, ((t[3] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 4, ((t[4] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 5, ((t[5] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 6, ((t[6] >> (i * 8)) & 0xff)) ^ \ + BOX (s_sbob_sl64, 7, ((t[7] >> (i * 8)) & 0xff)) // constants @@ -2226,11 +2236,11 @@ __constant u64 sbob_rc64[12][8] = }, }; -static void streebog_g (u64 h[8], const u64 m[8], __local u64 s_sbob_sl64[8][256]) +static void streebog_g (u64x h[8], const u64x m[8], __local u64 s_sbob_sl64[8][256]) { - u64 k[8]; - u64 s[8]; - u64 t[8]; + u64x k[8]; + u64x s[8]; + u64x t[8]; #pragma unroll for (int i = 0; i < 8; i++) @@ -2297,17 +2307,17 @@ static void m11800m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * reverse message block */ - u64 m[8]; + u64x m[8]; m[0] = hl32_to_64 (w[15], w[14]); m[1] = hl32_to_64 (w[13], w[12]); @@ -2316,7 +2326,7 @@ static void m11800m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[4] = hl32_to_64 (w[ 7], w[ 6]); m[5] = hl32_to_64 (w[ 5], w[ 4]); m[6] = hl32_to_64 (w[ 3], w[ 2]); - m[7] = hl32_to_64 (w[ 1], w[ 0]); + m[7] = hl32_to_64 (w[ 1], w0lr ); m[0] = swap64 (m[0]); m[1] = swap64 (m[1]); @@ -2329,7 +2339,7 @@ static void m11800m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le // state buffer (hash) - u64 h[8]; + u64x h[8]; h[0] = INITVAL; h[1] = INITVAL; @@ -2342,7 +2352,7 @@ static void m11800m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, m, s_sbob_sl64); - u64 z[8]; + u64x z[8]; z[0] = 0; z[1] = 0; @@ -2356,12 +2366,12 @@ static void m11800m (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); - const u32 r0 = l32_from_64 (h[0]); - const u32 r1 = h32_from_64 (h[0]); - const u32 r2 = l32_from_64 (h[1]); - const u32 r3 = h32_from_64 (h[1]); + const u32x r0 = l32_from_64 (h[0]); + const u32x r1 = h32_from_64 (h[0]); + const u32x r2 = l32_from_64 (h[1]); + const u32x r3 = h32_from_64 (h[1]); - #include COMPARE_M + COMPARE_M_SIMD (r0, r1, r2, r3); } } @@ -2392,17 +2402,17 @@ static void m11800s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le u32 w0l = w[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * reverse message block */ - u64 m[8]; + u64x m[8]; m[0] = hl32_to_64 (w[15], w[14]); m[1] = hl32_to_64 (w[13], w[12]); @@ -2411,7 +2421,7 @@ static void m11800s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le m[4] = hl32_to_64 (w[ 7], w[ 6]); m[5] = hl32_to_64 (w[ 5], w[ 4]); m[6] = hl32_to_64 (w[ 3], w[ 2]); - m[7] = hl32_to_64 (w[ 1], w[ 0]); + m[7] = hl32_to_64 (w[ 1], w0lr ); m[0] = swap64 (m[0]); m[1] = swap64 (m[1]); @@ -2424,7 +2434,7 @@ static void m11800s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le // state buffer (hash) - u64 h[8]; + u64x h[8]; h[0] = INITVAL; h[1] = INITVAL; @@ -2437,7 +2447,7 @@ static void m11800s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, m, s_sbob_sl64); - u64 z[8]; + u64x z[8]; z[0] = 0; z[1] = 0; @@ -2451,12 +2461,12 @@ static void m11800s (__local u64 s_sbob_sl64[8][256], u32 w[16], const u32 pw_le streebog_g (h, z, s_sbob_sl64); streebog_g (h, m, s_sbob_sl64); - const u32 r0 = l32_from_64 (h[0]); - const u32 r1 = h32_from_64 (h[0]); - const u32 r2 = l32_from_64 (h[1]); - const u32 r3 = h32_from_64 (h[1]); + const u32x r0 = l32_from_64 (h[0]); + const u32x r1 = h32_from_64 (h[0]); + const u32x r2 = l32_from_64 (h[1]); + const u32x r3 = h32_from_64 (h[1]); - #include COMPARE_S + COMPARE_S_SIMD (r0, r1, r2, r3); } } diff --git a/OpenCL/m12600_a1.cl b/OpenCL/m12600_a1.cl index c77c2cdd9..80485c679 100644 --- a/OpenCL/m12600_a1.cl +++ b/OpenCL/m12600_a1.cl @@ -89,7 +89,7 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -149,7 +149,7 @@ __kernel void m12600_m04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; @@ -516,7 +516,7 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordl0, wordl1, pw_l_len); - switch_buffer_by_offset (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); + switch_buffer_by_offset_le (wordl0, wordl1, wordl2, wordl3, combs_buf[0].pw_len); } /** @@ -588,7 +588,7 @@ __kernel void m12600_s04 (__global pw_t *pws, __global kernel_rule_t *rules_buf, { append_0x80_2x4 (wordr0, wordr1, pw_r_len); - switch_buffer_by_offset (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le (wordr0, wordr1, wordr2, wordr3, pw_l_len); } u32 w0[4]; diff --git a/OpenCL/m12600_a3.cl b/OpenCL/m12600_a3.cl index e127b727c..ddfe3c745 100644 --- a/OpenCL/m12600_a3.cl +++ b/OpenCL/m12600_a3.cl @@ -5,6 +5,8 @@ #define _SHA256_SHA1_ +#define NEW_SIMD_CODE + #include "include/constants.h" #include "include/kernel_vendor.h" @@ -16,11 +18,17 @@ #include "include/kernel_functions.c" #include "OpenCL/types_ocl.c" #include "OpenCL/common.c" +#include "OpenCL/simd.c" -#define COMPARE_S "OpenCL/check_single_comp4.c" -#define COMPARE_M "OpenCL/check_multi_comp4.c" - -#define uint_to_hex_upper8(i) l_bin2asc[(i)] +#if VECT_SIZE == 1 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i)]) +#elif VECT_SIZE == 2 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1]) +#elif VECT_SIZE == 4 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3]) +#elif VECT_SIZE == 8 +#define uint_to_hex_upper8(i) (u32x) (l_bin2asc[(i).s0], l_bin2asc[(i).s1], l_bin2asc[(i).s2], l_bin2asc[(i).s3], l_bin2asc[(i).s4], l_bin2asc[(i).s5], l_bin2asc[(i).s6], l_bin2asc[(i).s7]) +#endif static void m12600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_len, __global pw_t *pws, __global kernel_rule_t *rules_buf, __global comb_t *combs_buf, __global bf_t *bfs_buf, __global void *tmps, __global void *hooks, __global u32 *bitmaps_buf_s1_a, __global u32 *bitmaps_buf_s1_b, __global u32 *bitmaps_buf_s1_c, __global u32 *bitmaps_buf_s1_d, __global u32 *bitmaps_buf_s2_a, __global u32 *bitmaps_buf_s2_b, __global u32 *bitmaps_buf_s2_c, __global u32 *bitmaps_buf_s2_d, __global plain_t *plains_buf, __global digest_t *digests_buf, __global u32 *hashes_shown, __global salt_t *salt_bufs, __global void *esalt_bufs, __global u32 *d_return_buf, __global u32 *d_scryptV_buf, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2, const u32 salt_pos, const u32 loop_pos, const u32 loop_cnt, const u32 bfs_cnt, const u32 digests_cnt, const u32 digests_offset, __local u32 l_bin2asc[256]) { @@ -52,41 +60,41 @@ static void m12600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; - u32 f = 0; - u32 g = 0; - u32 h = 0; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; + u32x f = 0; + u32x g = 0; + u32x h = 0; #undef K #define K SHA1C00 @@ -309,12 +317,7 @@ static void m12600m (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_M + COMPARE_M_SIMD (d, h, c, g); } } @@ -360,41 +363,41 @@ static void m12600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le u32 w0l = w0[0]; - for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos++) + for (u32 il_pos = 0; il_pos < bfs_cnt; il_pos += VECT_SIZE) { - const u32 w0r = bfs_buf[il_pos].i; + const u32x w0r = w0r_create_bft (bfs_buf, il_pos); - w0[0] = w0l | w0r; + const u32x w0lr = w0l | w0r; /** * sha1 */ - u32 w0_t = w0[0]; - u32 w1_t = w0[1]; - u32 w2_t = w0[2]; - u32 w3_t = w0[3]; - u32 w4_t = w1[0]; - u32 w5_t = w1[1]; - u32 w6_t = w1[2]; - u32 w7_t = w1[3]; - u32 w8_t = w2[0]; - u32 w9_t = w2[1]; - u32 wa_t = w2[2]; - u32 wb_t = w2[3]; - u32 wc_t = w3[0]; - u32 wd_t = w3[1]; - u32 we_t = 0; - u32 wf_t = pw_len * 8; + u32x w0_t = w0lr; + u32x w1_t = w0[1]; + u32x w2_t = w0[2]; + u32x w3_t = w0[3]; + u32x w4_t = w1[0]; + u32x w5_t = w1[1]; + u32x w6_t = w1[2]; + u32x w7_t = w1[3]; + u32x w8_t = w2[0]; + u32x w9_t = w2[1]; + u32x wa_t = w2[2]; + u32x wb_t = w2[3]; + u32x wc_t = w3[0]; + u32x wd_t = w3[1]; + u32x we_t = 0; + u32x wf_t = pw_len * 8; - u32 a = SHA1M_A; - u32 b = SHA1M_B; - u32 c = SHA1M_C; - u32 d = SHA1M_D; - u32 e = SHA1M_E; - u32 f = 0; - u32 g = 0; - u32 h = 0; + u32x a = SHA1M_A; + u32x b = SHA1M_B; + u32x c = SHA1M_C; + u32x d = SHA1M_D; + u32x e = SHA1M_E; + u32x f = 0; + u32x g = 0; + u32x h = 0; #undef K #define K SHA1C00 @@ -617,12 +620,7 @@ static void m12600s (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 pw_le we_t = SHA256_EXPAND (wc_t, w7_t, wf_t, we_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, c, d, e, f, g, h, a, b, we_t, SHA256C3e); wf_t = SHA256_EXPAND (wd_t, w8_t, w0_t, wf_t); SHA256_STEP (SHA256_F0o, SHA256_F1o, b, c, d, e, f, g, h, a, wf_t, SHA256C3f); - const u32 r0 = d; - const u32 r1 = h; - const u32 r2 = c; - const u32 r3 = g; - - #include COMPARE_S + COMPARE_S_SIMD (d, h, c, g); } } diff --git a/OpenCL/simd.c b/OpenCL/simd.c index 2599b3fd2..dc9dba869 100644 --- a/OpenCL/simd.c +++ b/OpenCL/simd.c @@ -588,3 +588,20 @@ #define MATCHES_NONE_VV(a,b) !(MATCHES_ONE_VV ((a), (b))) #define MATCHES_NONE_VS(a,b) !(MATCHES_ONE_VS ((a), (b))) + +// attack-mode 0 + +static inline u32x w0r_create_bft (__global bf_t *bfs_buf, const u32 il_pos) +{ + #if VECT_SIZE == 1 + const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i); + #elif VECT_SIZE == 2 + const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i); + #elif VECT_SIZE == 4 + const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i); + #elif VECT_SIZE == 8 + const u32x w0r = (u32x) (bfs_buf[il_pos + 0].i, bfs_buf[il_pos + 1].i, bfs_buf[il_pos + 2].i, bfs_buf[il_pos + 3].i, bfs_buf[il_pos + 4].i, bfs_buf[il_pos + 5].i, bfs_buf[il_pos + 6].i, bfs_buf[il_pos + 7].i); + #endif + + return w0r; +} diff --git a/OpenCL/types_ocl.c b/OpenCL/types_ocl.c index 137f67e09..e1dac4aaa 100644 --- a/OpenCL/types_ocl.c +++ b/OpenCL/types_ocl.c @@ -3,6 +3,9 @@ * License.....: MIT */ +#define DEVICE_TYPE_CPU 2 +#define DEVICE_TYPE_GPU 4 + typedef uchar u8; typedef ushort u16; typedef uint u32; @@ -14,63 +17,403 @@ typedef ulong u64; #endif #if VECT_SIZE == 1 -typedef uint u32x; -typedef ulong u64x; +typedef uchar u8x; +typedef ushort u16x; +typedef uint u32x; +typedef ulong u64x; #endif #if VECT_SIZE == 2 -typedef uint2 u32x; -typedef ulong2 u64x; +typedef uchar2 u8x; +typedef ushort2 u16x; +typedef uint2 u32x; +typedef ulong2 u64x; #endif #if VECT_SIZE == 4 -typedef uint4 u32x; -typedef ulong4 u64x; +typedef uchar4 u8x; +typedef ushort4 u16x; +typedef uint4 u32x; +typedef ulong4 u64x; #endif #if VECT_SIZE == 8 -typedef uint8 u32x; -typedef ulong8 u64x; +typedef uchar8 u8x; +typedef ushort8 u16x; +typedef uint8 u32x; +typedef ulong8 u64x; #endif // this one needs to die #define allx(r) r -static inline u32 l32_from_64 (u64 a) +static inline u32 l32_from_64_S (u64 a) { - const u32 r = (uint) (a); + const u32 r = (u32) (a); return r; } -static inline u32 h32_from_64 (u64 a) +static inline u32 h32_from_64_S (u64 a) { a >>= 32; - const u32 r = (uint) (a); + const u32 r = (u32) (a); return r; } -static inline u64 hl32_to_64 (const u32 a, const u32 b) +static inline u64 hl32_to_64_S (const u32 a, const u32 b) { return as_ulong ((uint2) (b, a)); } +static inline u32x l32_from_64 (u64x a) +{ + u32x r; + + #if VECT_SIZE == 1 + r = (u32) a; + #endif + + #if VECT_SIZE >= 2 + r.s0 = (u32) a.s0; + r.s1 = (u32) a.s1; + #endif + + #if VECT_SIZE >= 4 + r.s2 = (u32) a.s2; + r.s3 = (u32) a.s3; + #endif + + #if VECT_SIZE >= 8 + r.s4 = (u32) a.s4; + r.s5 = (u32) a.s5; + r.s6 = (u32) a.s6; + r.s7 = (u32) a.s7; + #endif + + return r; +} + +static inline u32x h32_from_64 (u64x a) +{ + a >>= 32; + + u32x r; + + #if VECT_SIZE == 1 + r = (u32) a; + #endif + + #if VECT_SIZE >= 2 + r.s0 = (u32) a.s0; + r.s1 = (u32) a.s1; + #endif + + #if VECT_SIZE >= 4 + r.s2 = (u32) a.s2; + r.s3 = (u32) a.s3; + #endif + + #if VECT_SIZE >= 8 + r.s4 = (u32) a.s4; + r.s5 = (u32) a.s5; + r.s6 = (u32) a.s6; + r.s7 = (u32) a.s7; + #endif + + return r; +} + +static inline u64x hl32_to_64 (const u32x a, const u32x b) +{ + u64x r; + + #if VECT_SIZE == 1 + r = as_ulong ((uint2) (b, a)); + #endif + + #if VECT_SIZE >= 2 + r.s0 = as_ulong ((uint2) (b.s0, a.s0)); + r.s1 = as_ulong ((uint2) (b.s1, a.s1)); + #endif + + #if VECT_SIZE >= 4 + r.s2 = as_ulong ((uint2) (b.s2, a.s2)); + r.s3 = as_ulong ((uint2) (b.s3, a.s3)); + #endif + + #if VECT_SIZE >= 8 + r.s4 = as_ulong ((uint2) (b.s4, a.s4)); + r.s5 = as_ulong ((uint2) (b.s5, a.s5)); + r.s6 = as_ulong ((uint2) (b.s6, a.s6)); + r.s7 = as_ulong ((uint2) (b.s7, a.s7)); + #endif + + return r; +} + #ifdef IS_AMD -static inline u32 swap32 (const u32 v) +static inline u32 swap32_S (const u32 v) { return (as_uint (as_uchar4 (v).s3210)); } -static inline u64 swap64 (const u64 v) +static inline u64 swap64_S (const u64 v) { return (as_ulong (as_uchar8 (v).s76543210)); } + +static inline u32 rotr32_S (const u32 a, const u32 n) +{ + return rotate (a, 32 - n); +} + +static inline u32 rotl32_S (const u32 a, const u32 n) +{ + return rotate (a, n); +} + +static inline u64 rotr64_S (const u64 a, const u32 n) +{ + u64 r; + + #if DEVICE_TYPE == DEVICE_TYPE_CPU + + r = rotate (a, (u64) 64 - n); + + #else + + uint2 a2 = as_uint2 (a); + + uint2 t; + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) + : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) + : amd_bitalign (a2.s0, a2.s1, n); + + r = as_ulong (t); + + #endif + + return r; +} + +static inline u64 rotl64_S (const u64 a, const u32 n) +{ + return rotr64_S (a, 64 - n); +} + +static inline u32x swap32 (const u32x v) +{ + return ((v >> 24) & 0x000000ff) + | ((v >> 8) & 0x0000ff00) + | ((v << 8) & 0x00ff0000) + | ((v << 24) & 0xff000000); +} + +static inline u64x swap64 (const u64x v) +{ + return ((v >> 56) & 0x00000000000000ff) + | ((v >> 40) & 0x000000000000ff00) + | ((v >> 24) & 0x0000000000ff0000) + | ((v >> 8) & 0x00000000ff000000) + | ((v << 8) & 0x000000ff00000000) + | ((v << 24) & 0x0000ff0000000000) + | ((v << 40) & 0x00ff000000000000) + | ((v << 56) & 0xff00000000000000); +} + +static inline u32x rotr32 (const u32x a, const u32 n) +{ + return rotate (a, 32 - n); +} + +static inline u32x rotl32 (const u32x a, const u32 n) +{ + return rotate (a, n); +} + +static inline u64x rotr64 (const u64x a, const u32 n) +{ + u64x r; + + #if DEVICE_TYPE == DEVICE_TYPE_CPU + + r = rotate (a, (u64) 64 - n); + + #else + + uint2 a2; + uint2 t; + + #if VECT_SIZE == 1 + + a2 = as_uint2 (a); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r = as_ulong (t); + + #elif VECT_SIZE == 2 + + { + a2 = as_uint2 (a.s0); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s0 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s1); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s1 = as_ulong (t); + } + + #elif VECT_SIZE == 4 + + { + a2 = as_uint2 (a.s0); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s0 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s1); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s1 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s2); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s2 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s3); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s3 = as_ulong (t); + } + + #elif VECT_SIZE == 8 + + { + a2 = as_uint2 (a.s0); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s0 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s1); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s1 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s2); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s2 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s3); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s3 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s4); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s4 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s5); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s5 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s6); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s6 = as_ulong (t); + } + + { + a2 = as_uint2 (a.s7); + + t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) : amd_bitalign (a2.s1, a2.s0, n); + t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) : amd_bitalign (a2.s0, a2.s1, n); + + r.s7 = as_ulong (t); + } + + #endif + #endif + + return r; +} + +static inline u64x rotl64 (const u64x a, const u32 n) +{ + return rotr64 (a, 64 - n); +} + +static inline u32 __bfe (const u32 a, const u32 b, const u32 c) +{ + return amd_bfe (a, b, c); +} + +static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +{ + return amd_bytealign (a, b, c); +} #endif #ifdef IS_NV -static inline u32 swap32 (const u32 v) +static inline u32 swap32_S (const u32 v) { u32 r; @@ -79,7 +422,7 @@ static inline u32 swap32 (const u32 v) return r; } -static inline u64 swap64 (const u64 v) +static inline u64 swap64_S (const u64 v) { u32 il; u32 ir; @@ -98,34 +441,122 @@ static inline u64 swap64 (const u64 v) return r; } -#endif -#ifdef IS_GENERIC -static inline u32 swap32 (const u32 v) +static inline u32 rotr32_S (const u32 a, const u32 n) { - return (as_uint (as_uchar4 (v).s3210)); + return rotate (a, 32 - n); } -static inline u64 swap64 (const u64 v) +static inline u32 rotl32_S (const u32 a, const u32 n) { - return (as_ulong (as_uchar8 (v).s76543210)); + return rotate (a, n); +} + +#if CUDA_ARCH >= 350 +static inline u64 rotr64_S (const u64 a, const u32 n) +{ + u32 il; + u32 ir; + + asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a)); + + u32 tl; + u32 tr; + + if (n >= 32) + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); + } + else + { + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); + asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); + } + + u64 r; + + asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr)); + + return r; +} +#else +static inline u64 rotr64_S (const u64 a, const u32 n) +{ + return rotate (a, (u64) 64 - n); } #endif -#ifdef IS_AMD -static inline u32 __bfe (const u32 a, const u32 b, const u32 c) +static inline u64 rotl64_S (const u64 a, const u32 n) { - return amd_bfe (a, b, c); + return rotr64_S (a, 64 - n); } -static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +#if CUDA_ARCH >= 500 +static inline u32 lut3_2d_S (const u32 a, const u32 b, const u32 c) { - return amd_bytealign (a, b, c); + u32 r; + + asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); + + return r; } +static inline u32 lut3_39_S (const u32 a, const u32 b, const u32 c) +{ + u32 r; + + asm ("lop3.b32 %0, %1, %2, %3, 0x39;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); + + return r; +} + +static inline u32 lut3_59_S (const u32 a, const u32 b, const u32 c) +{ + u32 r; + + asm ("lop3.b32 %0, %1, %2, %3, 0x59;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); + + return r; +} + +static inline u32 lut3_96_S (const u32 a, const u32 b, const u32 c) +{ + u32 r; + + asm ("lop3.b32 %0, %1, %2, %3, 0x96;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); + + return r; +} + +static inline u32 lut3_e4_S (const u32 a, const u32 b, const u32 c) +{ + u32 r; + + asm ("lop3.b32 %0, %1, %2, %3, 0xe4;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); + + return r; +} + +static inline u32 lut3_e8_S (const u32 a, const u32 b, const u32 c) +{ + u32 r; + + asm ("lop3.b32 %0, %1, %2, %3, 0xe8;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); + + return r; +} + +static inline u32 lut3_ca_S (const u32 a, const u32 b, const u32 c) +{ + u32 r; + + asm ("lop3.b32 %0, %1, %2, %3, 0xca;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); + + return r; +} #endif -#ifdef IS_NV static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -135,6 +566,46 @@ static inline u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) return r; } +static inline u32x swap32 (const u32x v) +{ + return ((v >> 24) & 0x000000ff) + | ((v >> 8) & 0x0000ff00) + | ((v << 8) & 0x00ff0000) + | ((v << 24) & 0xff000000); +} + +static inline u64x swap64 (const u64x v) +{ + return ((v >> 56) & 0x00000000000000ff) + | ((v >> 40) & 0x000000000000ff00) + | ((v >> 24) & 0x0000000000ff0000) + | ((v >> 8) & 0x00000000ff000000) + | ((v << 8) & 0x000000ff00000000) + | ((v << 24) & 0x0000ff0000000000) + | ((v << 40) & 0x00ff000000000000) + | ((v << 56) & 0xff00000000000000); +} + +static inline u32x rotr32 (const u32x a, const u32 n) +{ + return rotate (a, 32 - n); +} + +static inline u32x rotl32 (const u32x a, const u32 n) +{ + return rotate (a, n); +} + +static inline u64x rotr64 (const u64x a, const u32 n) +{ + return rotate (a, (u64) 64 - n); +} + +static inline u64x rotl64 (const u64x a, const u32 n) +{ + return rotate (a, (u64) n); +} + static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) { u32x r; @@ -143,23 +614,17 @@ static inline u32x __byte_perm (const u32x a, const u32x b, const u32x c) asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r) : "r"(a), "r"(b), "r"(c) ); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0)); asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1)); #endif - #if VECT_SIZE == 4 - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0)); - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1)); + #if VECT_SIZE >= 4 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2)); asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3)); #endif - #if VECT_SIZE == 8 - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s0) : "r"(a.s0), "r"(b.s0), "r"(c.s0)); - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s1) : "r"(a.s1), "r"(b.s1), "r"(c.s1)); - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s2) : "r"(a.s2), "r"(b.s2), "r"(c.s2)); - asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s3) : "r"(a.s3), "r"(b.s3), "r"(c.s3)); + #if VECT_SIZE >= 8 asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s4) : "r"(a.s4), "r"(b.s4), "r"(c.s4)); asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s5) : "r"(a.s5), "r"(b.s5), "r"(c.s5)); asm ("prmt.b32 %0, %1, %2, %3;" : "=r"(r.s6) : "r"(a.s6), "r"(b.s6), "r"(c.s6)); @@ -193,160 +658,7 @@ static inline u32 amd_bytealign (const u32 a, const u32 b, const u32 c) return __byte_perm_S (b, a, (0x76543210 >> ((c & 3) * 4)) & 0xffff); } #endif -#endif -#ifdef IS_GENERIC -static inline u32 __bfe (const u32 a, const u32 b, const u32 c) -{ - #define BIT(x) (1 << (x)) - #define BIT_MASK(x) (BIT (x) - 1) - #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z)) - - return BFE (a, b, c); -} - -static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) -{ - const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8); - - return (u32) (tmp); -} - -static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c) -{ - #if VECT_SIZE == 1 - const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8); - - return (u32x) (tmp); - #endif - - #if VECT_SIZE == 2 - const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8); - - return (u32x) (tmp.s0, tmp.s1); - #endif - - #if VECT_SIZE == 4 - const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8); - - return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3); - #endif - - #if VECT_SIZE == 8 - const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8); - - return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7); - #endif -} -#endif - -#ifdef IS_AMD -static inline u32x rotr32 (const u32x a, const u32 n) -{ - return rotate (a, 32 - n); -} - -static inline u32x rotl32 (const u32x a, const u32 n) -{ - return rotate (a, n); -} - -static inline u64 rotr64 (const u64 a, const u32 n) -{ - uint2 a2 = as_uint2 (a); - - uint2 t; - - t.s0 = (n >= 32) ? amd_bitalign (a2.s0, a2.s1, n - 32) - : amd_bitalign (a2.s1, a2.s0, n); - t.s1 = (n >= 32) ? amd_bitalign (a2.s1, a2.s0, n - 32) - : amd_bitalign (a2.s0, a2.s1, n); - - return as_ulong (t); -} - -static inline u64 rotl64 (const u64 a, const u32 n) -{ - return rotr64 (a, 64 - n); -} -#endif - -#ifdef IS_NV -static inline u32x rotr32 (const u32x a, const u32 n) -{ - return rotate (a, 32 - n); -} - -static inline u32x rotl32 (const u32x a, const u32 n) -{ - return rotate (a, n); -} - -#if CUDA_ARCH >= 350 -static inline u64 rotr64 (const u64 a, const u32 n) -{ - u32 il; - u32 ir; - - asm ("mov.b64 {%0, %1}, %2;" : "=r"(il), "=r"(ir) : "l"(a)); - - u32 tl; - u32 tr; - - if (n >= 32) - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(ir), "r"(il), "r"(n - 32)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(il), "r"(ir), "r"(n - 32)); - } - else - { - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tl) : "r"(il), "r"(ir), "r"(n)); - asm ("shf.r.wrap.b32 %0, %1, %2, %3;" : "=r"(tr) : "r"(ir), "r"(il), "r"(n)); - } - - u64 r; - - asm ("mov.b64 %0, {%1, %2};" : "=l"(r) : "r"(tl), "r"(tr)); - - return r; -} -#else -static inline u64 rotr64 (const u64 a, const u32 n) -{ - return rotate (a, (u64) 64 - n); -} -#endif - -static inline u64 rotl64 (const u64 a, const u32 n) -{ - return rotr64 (a, 64 - n); -} -#endif - -#ifdef IS_GENERIC - -static inline u32x rotr32 (const u32x a, const u32x n) -{ - return rotate (a, 32 - n); -} - -static inline u32x rotl32 (const u32x a, const u32x n) -{ - return rotate (a, n); -} - -static inline u64 rotr64 (const u64 a, const u32 n) -{ - return rotate (a, (u64) 64 - n); -} - -static inline u64 rotl64 (const u64 a, const u32 n) -{ - return rotate (a, (u64) n); -} -#endif - -#ifdef IS_NV #if CUDA_ARCH >= 500 static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c) { @@ -356,23 +668,17 @@ static inline u32x lut3_2d (const u32x a, const u32x b, const u32x c) asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r) : "r" (a), "r" (b), "r" (c)); #endif - #if VECT_SIZE == 2 + #if VECT_SIZE >= 2 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); #endif - #if VECT_SIZE == 4 - asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); + #if VECT_SIZE >= 4 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); #endif - #if VECT_SIZE == 8 - asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s0) : "r" (a.s0), "r" (b.s0), "r" (c.s0)); - asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s1) : "r" (a.s1), "r" (b.s1), "r" (c.s1)); - asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s2) : "r" (a.s2), "r" (b.s2), "r" (c.s2)); - asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s3) : "r" (a.s3), "r" (b.s3), "r" (c.s3)); + #if VECT_SIZE >= 8 asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s4) : "r" (a.s4), "r" (b.s4), "r" (c.s4)); asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s5) : "r" (a.s5), "r" (b.s5), "r" (c.s5)); asm ("lop3.b32 %0, %1, %2, %3, 0x2d;" : "=r" (r.s6) : "r" (a.s6), "r" (b.s6), "r" (c.s6)); @@ -589,6 +895,121 @@ static inline u32x lut3_ca (const u32x a, const u32x b, const u32x c) #endif #endif +#ifdef IS_GENERIC +static inline u32 swap32_S (const u32 v) +{ + return (as_uint (as_uchar4 (v).s3210)); +} + +static inline u64 swap64_S (const u64 v) +{ + return (as_ulong (as_uchar8 (v).s76543210)); +} + +static inline u32 rotr32_S (const u32 a, const u32 n) +{ + return rotate (a, 32 - n); +} + +static inline u32 rotl32_S (const u32 a, const u32 n) +{ + return rotate (a, n); +} + +static inline u64 rotr64_S (const u64 a, const u32 n) +{ + return rotate (a, (u64) 64 - n); +} + +static inline u64 rotl64_S (const u64 a, const u32 n) +{ + return rotate (a, (u64) n); +} + +static inline u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) +{ + const u64 tmp = ((((u64) a) << 32) | ((u64) b)) >> ((c & 3) * 8); + + return (u32) (tmp); +} + +static inline u32x swap32 (const u32x v) +{ + return ((v >> 24) & 0x000000ff) + | ((v >> 8) & 0x0000ff00) + | ((v << 8) & 0x00ff0000) + | ((v << 24) & 0xff000000); +} + +static inline u64x swap64 (const u64x v) +{ + return ((v >> 56) & 0x00000000000000ff) + | ((v >> 40) & 0x000000000000ff00) + | ((v >> 24) & 0x0000000000ff0000) + | ((v >> 8) & 0x00000000ff000000) + | ((v << 8) & 0x000000ff00000000) + | ((v << 24) & 0x0000ff0000000000) + | ((v << 40) & 0x00ff000000000000) + | ((v << 56) & 0xff00000000000000); +} + +static inline u32x rotr32 (const u32x a, const u32 n) +{ + return rotate (a, 32 - n); +} + +static inline u32x rotl32 (const u32x a, const u32 n) +{ + return rotate (a, n); +} + +static inline u64x rotr64 (const u64x a, const u32 n) +{ + return rotate (a, (u64) 64 - n); +} + +static inline u64x rotl64 (const u64x a, const u32 n) +{ + return rotate (a, (u64) n); +} + +static inline u32 __bfe (const u32 a, const u32 b, const u32 c) +{ + #define BIT(x) (1 << (x)) + #define BIT_MASK(x) (BIT (x) - 1) + #define BFE(x,y,z) (((x) >> (y)) & BIT_MASK (z)) + + return BFE (a, b, c); +} + +static inline u32x amd_bytealign (const u32x a, const u32x b, const u32 c) +{ + #if VECT_SIZE == 1 + const u64x tmp = ((((u64x) (a)) << 32) | ((u64x) (b))) >> ((c & 3) * 8); + + return (u32x) (tmp); + #endif + + #if VECT_SIZE == 2 + const u64x tmp = ((((u64x) (a.s0, a.s1)) << 32) | ((u64x) (b.s0, b.s1))) >> ((c & 3) * 8); + + return (u32x) (tmp.s0, tmp.s1); + #endif + + #if VECT_SIZE == 4 + const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3))) >> ((c & 3) * 8); + + return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3); + #endif + + #if VECT_SIZE == 8 + const u64x tmp = ((((u64x) (a.s0, a.s1, a.s2, a.s3, a.s4, a.s5, a.s6, a.s7)) << 32) | ((u64x) (b.s0, b.s1, b.s2, b.s3, b.s4, b.s5, b.s6, b.s7))) >> ((c & 3) * 8); + + return (u32x) (tmp.s0, tmp.s1, tmp.s2, tmp.s3, tmp.s4, tmp.s5, tmp.s6, tmp.s7); + #endif +} +#endif + typedef struct { #if defined _DES_ diff --git a/include/kernel_functions.c b/include/kernel_functions.c index 7ef6fa589..44cf1b940 100644 --- a/include/kernel_functions.c +++ b/include/kernel_functions.c @@ -5,6 +5,10 @@ #if defined _MD4_ || defined _DCC2_ || defined _NETNTLMV2_ || defined _KRB5PA_ || defined _MS_DRSR_ +#define MD4_F_S(x,y,z) (((x) & (y)) | ((~(x)) & (z))) +#define MD4_G_S(x,y,z) (((x) & (y)) | ((x) & (z)) | ((y) & (z))) +#define MD4_H_S(x,y,z) ((x) ^ (y) ^ (z)) + #ifdef IS_NV #if CUDA_ARCH >= 500 #define MD4_F(x,y,z) lut3_ca ((x), (y), (z)) @@ -62,6 +66,11 @@ #if defined _MD5_ || defined _MD5H_ || defined _SAPB_ || defined _OLDOFFICE01_ || defined _WPA_ || defined _MD5_SHA1_ || defined _SHA1_MD5_ || defined _NETNTLMV2_ || defined _KRB5PA_ || defined _PBKDF2_MD5_ +#define MD5_F_S(x,y,z) ((z) ^ ((x) & ((y) ^ (z)))) +#define MD5_G_S(x,y,z) ((y) ^ ((z) & ((x) ^ (y)))) +#define MD5_H_S(x,y,z) ((x) ^ (y) ^ (z)) +#define MD5_I_S(x,y,z) ((y) ^ ((x) | ~(z))) + #ifdef IS_NV #if CUDA_ARCH >= 500 #define MD5_F(x,y,z) lut3_ca ((x), (y), (z)) @@ -272,7 +281,7 @@ #define SHA384_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \ { \ - u64 temp0; \ + u64x temp0; \ temp0 = K; \ temp0 += x; \ temp0 += h; \ @@ -316,7 +325,7 @@ #define SHA512_STEP(F0,F1,a,b,c,d,e,f,g,h,x,K) \ { \ - u64 temp0; \ + u64x temp0; \ temp0 = K; \ temp0 += x; \ temp0 += h; \ diff --git a/include/kernel_vendor.h b/include/kernel_vendor.h index 6b5c41891..0b6a57fe7 100644 --- a/include/kernel_vendor.h +++ b/include/kernel_vendor.h @@ -32,3 +32,10 @@ #ifdef IS_NV #endif + +/** + * Generic + */ + +#ifdef IS_GENERIC +#endif \ No newline at end of file diff --git a/include/shared.h b/include/shared.h index 5e89830b3..f1269860d 100644 --- a/include/shared.h +++ b/include/shared.h @@ -1411,7 +1411,11 @@ extern hc_thread_mutex_t mux_display; #define OPTI_TYPE_SINGLE_HASH (1 << 11) #define OPTI_TYPE_SINGLE_SALT (1 << 12) #define OPTI_TYPE_BRUTE_FORCE (1 << 13) -#define OPTI_TYPE_RAW_HASH (1 << 15) +#define OPTI_TYPE_RAW_HASH (1 << 14) +#define OPTI_TYPE_USES_BITS_8 (1 << 15) +#define OPTI_TYPE_USES_BITS_16 (1 << 16) +#define OPTI_TYPE_USES_BITS_32 (1 << 17) +#define OPTI_TYPE_USES_BITS_64 (1 << 18) #define OPTI_STR_ZERO_BYTE "Zero-Byte" #define OPTI_STR_PRECOMPUTE_INIT "Precompute-Init" diff --git a/src/oclHashcat.c b/src/oclHashcat.c index 426d453af..1873b39ff 100644 --- a/src/oclHashcat.c +++ b/src/oclHashcat.c @@ -7725,6 +7725,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_NOT_SALTED + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7748,6 +7749,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_APPENDED_SALT + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7771,6 +7773,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_APPENDED_SALT + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7794,6 +7797,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_PREPENDED_SALT + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7818,6 +7822,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_PREPENDED_SALT + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7842,6 +7847,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_APPENDED_SALT + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7867,6 +7873,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_APPENDED_SALT + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7891,6 +7898,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_PREPENDED_SALT + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7908,6 +7916,7 @@ int main (int argc, char **argv) parse_func = hmacsha512_parse_hash; sort_by_digest = sort_by_digest_8_8; opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_NOT_ITERATED; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7926,6 +7935,7 @@ int main (int argc, char **argv) parse_func = hmacsha512_parse_hash; sort_by_digest = sort_by_digest_8_8; opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_NOT_ITERATED; dgst_pos0 = 14; dgst_pos1 = 15; @@ -7941,7 +7951,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_8; parse_func = sha512crypt_parse_hash; sort_by_digest = sort_by_digest_8_8; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -8373,6 +8384,7 @@ int main (int argc, char **argv) parse_func = keccak_parse_hash; sort_by_digest = sort_by_digest_8_25; opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 2; dgst_pos1 = 3; @@ -8607,7 +8619,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_8; parse_func = truecrypt_parse_hash_1k; sort_by_digest = sort_by_digest_8_8; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -8622,7 +8635,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_8; parse_func = truecrypt_parse_hash_1k; sort_by_digest = sort_by_digest_8_8; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -8637,7 +8651,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_8; parse_func = truecrypt_parse_hash_1k; sort_by_digest = sort_by_digest_8_8; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -8772,7 +8787,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_8; parse_func = sha512aix_parse_hash; sort_by_digest = sort_by_digest_8_8; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -8847,7 +8863,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_16; parse_func = sha512osx_parse_hash; sort_by_digest = sort_by_digest_8_16; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -8862,7 +8879,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_16; parse_func = sha512grub_parse_hash; sort_by_digest = sort_by_digest_8_16; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -8985,7 +9003,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_8; parse_func = drupal7_parse_hash; sort_by_digest = sort_by_digest_8_8; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9592,6 +9611,7 @@ int main (int argc, char **argv) | OPTI_TYPE_EARLY_SKIP | OPTI_TYPE_NOT_ITERATED | OPTI_TYPE_NOT_SALTED + | OPTI_TYPE_USES_BITS_64 | OPTI_TYPE_RAW_HASH; dgst_pos0 = 6; dgst_pos1 = 7; @@ -9815,7 +9835,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_16; parse_func = pbkdf2_sha512_parse_hash; sort_by_digest = sort_by_digest_8_16; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9830,7 +9851,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_8; parse_func = ecryptfs_parse_hash; sort_by_digest = sort_by_digest_8_8; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -9845,7 +9867,8 @@ int main (int argc, char **argv) dgst_size = DGST_SIZE_8_16; parse_func = oraclet_parse_hash; sort_by_digest = sort_by_digest_8_16; - opti_type = OPTI_TYPE_ZERO_BYTE; + opti_type = OPTI_TYPE_ZERO_BYTE + | OPTI_TYPE_USES_BITS_64; dgst_pos0 = 0; dgst_pos1 = 1; dgst_pos2 = 2; @@ -12535,14 +12558,6 @@ int main (int argc, char **argv) device_param->platform_devices_id = platform_devices_id; - // vendor_id - - cl_uint vendor_id = 0; - - hc_clGetDeviceInfo (device_param->device, CL_DEVICE_VENDOR_ID, sizeof (vendor_id), &vendor_id, NULL); - - device_param->vendor_id = vendor_id; - // device_type cl_device_type device_type; @@ -12553,6 +12568,14 @@ int main (int argc, char **argv) device_param->device_type = device_type; + // vendor_id + + cl_uint vendor_id = 0; + + hc_clGetDeviceInfo (device_param->device, CL_DEVICE_VENDOR_ID, sizeof (vendor_id), &vendor_id, NULL); + + device_param->vendor_id = vendor_id; + // device_name char *device_name = (char *) mymalloc (INFOSZ); @@ -12574,7 +12597,7 @@ int main (int argc, char **argv) // pocl returns the real vendor_id in CL_DEVICE_VENDOR_ID which causes many problems because of hms and missing amd_bfe () etc // we need to overwrite vendor_id to avoid this. maybe open pocl issue? - cl_uint vendor_id = 0xffff; + cl_uint vendor_id = VENDOR_ID_GENERIC; device_param->vendor_id = vendor_id; } @@ -12583,20 +12606,25 @@ int main (int argc, char **argv) cl_uint vector_width; - if (1) // can be removed as soon as all kernel are migrated; if (attack_mode == ATTACK_MODE_BF) + if (opencl_vector_width == OPENCL_VECTOR_WIDTH) { - if (opencl_vector_width == OPENCL_VECTOR_WIDTH) + hc_clGetDeviceInfo (device_param->device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof (vector_width), &vector_width, NULL); + + if ((vendor_id == VENDOR_ID_NV) && (strstr (device_name, " Ti") || strstr (device_name, " TI"))) { - hc_clGetDeviceInfo (device_param->device, CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, sizeof (vector_width), &vector_width, NULL); + // Yeah that's a super bad hack, but there's no other attribute we could use + + if (vector_width < 2) vector_width *= 2; } - else + + if (opti_type & OPTI_TYPE_USES_BITS_64) { - vector_width = opencl_vector_width; + if (vector_width > 1) vector_width /= 2; } } else { - vector_width = 1; + vector_width = opencl_vector_width; } if (vector_width > 8) vector_width = 8; @@ -13207,7 +13235,7 @@ int main (int argc, char **argv) * create command-queue */ - // not support with NV + // not supported with NV // device_param->command_queue = hc_clCreateCommandQueueWithProperties (device_param->context, device_param->device, NULL); device_param->command_queue = hc_clCreateCommandQueue (device_param->context, device_param->device, 0); @@ -13224,10 +13252,20 @@ int main (int argc, char **argv) if (device_type & CL_DEVICE_TYPE_CPU) { - // CPU still need lots of workitems, don't know why... - // for testing phase, lets start with this - -// kernel_accel = 1; + if (benchmark_mode == 0) + { + if (kernel_accel > 16) + { + kernel_accel = 16; + } + } + else + { + if (kernel_accel > 64) + { + kernel_accel = 64; + } + } } uint kernel_power = device_processors * kernel_threads * kernel_accel; @@ -13447,7 +13485,7 @@ int main (int argc, char **argv) // we don't have sm_* on vendors not NV but it doesn't matter - sprintf (build_opts, "-I%s/ -DVENDOR_ID=%d -DCUDA_ARCH=%d -DVECT_SIZE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width); + sprintf (build_opts, "-I%s/ -DVENDOR_ID=%d -DCUDA_ARCH=%d -DVECT_SIZE=%u -DDEVICE_TYPE=%u", shared_dir, device_param->vendor_id, (device_param->sm_major * 100) + device_param->sm_minor, device_param->vector_width, (u32) device_param->device_type); /** * main kernel diff --git a/tools/test.sh b/tools/test.sh index 36e0bc670..80682f7ff 100755 --- a/tools/test.sh +++ b/tools/test.sh @@ -19,7 +19,7 @@ NEVER_CRACK="11600" SLOW_ALGOS="400 500 501 1600 1800 2100 2500 3200 5200 5800 6211 6221 6231 6241 6251 6261 6271 6281 6300 6400 6500 6600 6700 6800 7100 7200 7400 7900 8200 8800 8900 9000 9100 9200 9300 9400 9500 9600 10000 10300 10500 10700 10900 11300 11600 11900 12000 12100 12200 12300 12400 12500 12800 12900 13000" -OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 --opencl-device-types 2" +OPTS="--quiet --force --potfile-disable --runtime 200 --gpu-temp-disable --weak-hash-threshold=0 --opencl-device-types 2 --opencl-vector-width 2" OUTD="test_$(date +%s)"