From d76965348da0112bdb559486cefe301bf48e5084 Mon Sep 17 00:00:00 2001 From: Jens Steube Date: Mon, 10 Feb 2020 11:10:57 +0100 Subject: [PATCH] Small optimization for sha256crypt and add support for salt length up to 20 --- OpenCL/m07400-optimized.cl | 1081 +++++++++++++++++++++++++++--------- 1 file changed, 814 insertions(+), 267 deletions(-) diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index 5d8cf6af0..92b140bd2 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -24,33 +24,6 @@ typedef struct sha256crypt_tmp } sha256crypt_tmp_t; -DECLSPEC void sha256_transform_transport (const u32 *w, u32 *digest) -{ - u32 w0[4]; - u32 w1[4]; - u32 w2[4]; - u32 w3[4]; - - w0[0] = hc_swap32_S (w[ 0]); - w0[1] = hc_swap32_S (w[ 1]); - w0[2] = hc_swap32_S (w[ 2]); - w0[3] = hc_swap32_S (w[ 3]); - w1[0] = hc_swap32_S (w[ 4]); - w1[1] = hc_swap32_S (w[ 5]); - w1[2] = hc_swap32_S (w[ 6]); - w1[3] = hc_swap32_S (w[ 7]); - w2[0] = hc_swap32_S (w[ 8]); - w2[1] = hc_swap32_S (w[ 9]); - w2[2] = hc_swap32_S (w[10]); - w2[3] = hc_swap32_S (w[11]); - w3[0] = hc_swap32_S (w[12]); - w3[1] = hc_swap32_S (w[13]); - w3[2] = hc_swap32_S (w[14]); - w3[3] = hc_swap32_S (w[15]); - - sha256_transform (w0, w1, w2, w3, digest); -} - DECLSPEC void init_ctx (u32 *digest) { digest[0] = SHA256M_A; @@ -63,76 +36,29 @@ DECLSPEC void init_ctx (u32 *digest) digest[7] = SHA256M_H; } -DECLSPEC void bzero16 (u32 *block) -{ - block[ 0] = 0; - block[ 1] = 0; - block[ 2] = 0; - block[ 3] = 0; - block[ 4] = 0; - block[ 5] = 0; - block[ 6] = 0; - block[ 7] = 0; - block[ 8] = 0; - block[ 9] = 0; - block[10] = 0; - block[11] = 0; - block[12] = 0; - block[13] = 0; - block[14] = 0; - block[15] = 0; -} - -DECLSPEC void bswap8 (u32 *block) -{ - block[ 0] = hc_swap32_S (block[ 0]); - block[ 1] = hc_swap32_S (block[ 1]); - block[ 2] = hc_swap32_S (block[ 2]); - block[ 3] = hc_swap32_S (block[ 3]); - block[ 4] = hc_swap32_S (block[ 4]); - block[ 5] = hc_swap32_S (block[ 5]); - block[ 6] = hc_swap32_S (block[ 6]); - block[ 7] = hc_swap32_S (block[ 7]); -} - DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u32 append_len) { - u32 tmp0; - u32 tmp1; - u32 tmp2; - u32 tmp3; - u32 tmp4; - - #if defined IS_AMD || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = hc_bytealign ( 0, in0, offset); - tmp1 = hc_bytealign (in0, in1, offset); - tmp2 = hc_bytealign (in1, in2, offset); - tmp3 = hc_bytealign (in2, in3, offset); - tmp4 = hc_bytealign (in3, 0, offset); + #if defined IS_AMD || defined IS_GENERIC + const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); + const u32 tmp1 = hc_bytealign_be (in0, in1, offset); + const u32 tmp2 = hc_bytealign_be (in1, in2, offset); + const u32 tmp3 = hc_bytealign_be (in2, in3, offset); + const u32 tmp4 = hc_bytealign_be (in3, 0, offset); #endif #ifdef IS_NV - const int offset_mod_4 = offset & 3; + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - const int offset_minus_4 = 4 - offset_mod_4; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32 in0 = append[0]; - u32 in1 = append[1]; - u32 in2 = append[2]; - u32 in3 = append[3]; - - tmp0 = hc_byte_perm ( 0, in0, selector); - tmp1 = hc_byte_perm (in0, in1, selector); - tmp2 = hc_byte_perm (in1, in2, selector); - tmp3 = hc_byte_perm (in2, in3, selector); - tmp4 = hc_byte_perm (in3, 0, selector); + const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); + const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); + const u32 tmp2 = hc_byte_perm_S (in2, in1, selector); + const u32 tmp3 = hc_byte_perm_S (in3, in2, selector); + const u32 tmp4 = hc_byte_perm_S (0, in3, selector); #endif switch (offset / 4) @@ -232,45 +158,30 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3 DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u32 append_len, u32 *digest) { - u32 tmp0; - u32 tmp1; - u32 tmp2; - u32 tmp3; - u32 tmp4; - - #if defined IS_AMD || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = hc_bytealign ( 0, in0, offset); - tmp1 = hc_bytealign (in0, in1, offset); - tmp2 = hc_bytealign (in1, in2, offset); - tmp3 = hc_bytealign (in2, in3, offset); - tmp4 = hc_bytealign (in3, 0, offset); + #if defined IS_AMD || defined IS_GENERIC + const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); + const u32 tmp1 = hc_bytealign_be (in0, in1, offset); + const u32 tmp2 = hc_bytealign_be (in1, in2, offset); + const u32 tmp3 = hc_bytealign_be (in2, in3, offset); + const u32 tmp4 = hc_bytealign_be (in3, 0, offset); #endif #ifdef IS_NV - const int offset_mod_4 = offset & 3; + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - const int offset_minus_4 = 4 - offset_mod_4; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32 in0 = append[0]; - u32 in1 = append[1]; - u32 in2 = append[2]; - u32 in3 = append[3]; - - tmp0 = hc_byte_perm ( 0, in0, selector); - tmp1 = hc_byte_perm (in0, in1, selector); - tmp2 = hc_byte_perm (in1, in2, selector); - tmp3 = hc_byte_perm (in2, in3, selector); - tmp4 = hc_byte_perm (in3, 0, selector); + const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); + const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); + const u32 tmp2 = hc_byte_perm_S (in2, in1, selector); + const u32 tmp3 = hc_byte_perm_S (in3, in2, selector); + const u32 tmp4 = hc_byte_perm_S (0, in3, selector); #endif - u32 carry[4] = { 0, 0, 0, 0 }; + u32 carry[4] = { 0 }; switch (offset / 4) { @@ -378,57 +289,487 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u { new_len -= 64; - sha256_transform_transport (block, digest); + sha256_transform (block + 0, block + 4, block + 8, block + 12, digest); - bzero16 (block); - - block[0] = carry[0]; - block[1] = carry[1]; - block[2] = carry[2]; - block[3] = carry[3]; + block[ 0] = carry[0]; + block[ 1] = carry[1]; + block[ 2] = carry[2]; + block[ 3] = carry[3]; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; } return new_len; } -DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u32 append_len) +DECLSPEC u32 memcat16s (u32 *block, const u32 offset, const u32 *append, const u32 append_len) { - u32 tmp0; - u32 tmp1; - u32 tmp2; - u32 tmp3; - u32 tmp4; - - #if defined IS_AMD || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; u32 in3 = append[3]; + u32 in4 = append[4]; - tmp0 = hc_bytealign ( 0, in0, offset); - tmp1 = hc_bytealign (in0, in1, offset); - tmp2 = hc_bytealign (in1, in2, offset); - tmp3 = hc_bytealign (in2, in3, offset); - tmp4 = hc_bytealign (in3, 0, offset); + #if defined IS_AMD || defined IS_GENERIC + const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); + const u32 tmp1 = hc_bytealign_be (in0, in1, offset); + const u32 tmp2 = hc_bytealign_be (in1, in2, offset); + const u32 tmp3 = hc_bytealign_be (in2, in3, offset); + const u32 tmp4 = hc_bytealign_be (in3, in4, offset); + const u32 tmp5 = hc_bytealign_be (in4, 0, offset); #endif #ifdef IS_NV - const int offset_mod_4 = offset & 3; + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - const int offset_minus_4 = 4 - offset_mod_4; + const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); + const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); + const u32 tmp2 = hc_byte_perm_S (in2, in1, selector); + const u32 tmp3 = hc_byte_perm_S (in3, in2, selector); + const u32 tmp4 = hc_byte_perm_S (in4, in3, selector); + const u32 tmp5 = hc_byte_perm_S (0, in4, selector); + #endif - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + switch (offset / 4) + { + case 0: block[ 0] |= tmp0; + block[ 1] = tmp1; + block[ 2] = tmp2; + block[ 3] = tmp3; + block[ 4] = tmp4; + block[ 5] = tmp5; + break; + case 1: block[ 1] |= tmp0; + block[ 2] = tmp1; + block[ 3] = tmp2; + block[ 4] = tmp3; + block[ 5] = tmp4; + block[ 6] = tmp5; + break; + case 2: block[ 2] |= tmp0; + block[ 3] = tmp1; + block[ 4] = tmp2; + block[ 5] = tmp3; + block[ 6] = tmp4; + block[ 7] = tmp5; + break; + case 3: block[ 3] |= tmp0; + block[ 4] = tmp1; + block[ 5] = tmp2; + block[ 6] = tmp3; + block[ 7] = tmp4; + block[ 8] = tmp5; + break; + case 4: block[ 4] |= tmp0; + block[ 5] = tmp1; + block[ 6] = tmp2; + block[ 7] = tmp3; + block[ 8] = tmp4; + block[ 9] = tmp5; + break; + case 5: block[ 5] |= tmp0; + block[ 6] = tmp1; + block[ 7] = tmp2; + block[ 8] = tmp3; + block[ 9] = tmp4; + block[10] = tmp5; + break; + case 6: block[ 6] |= tmp0; + block[ 7] = tmp1; + block[ 8] = tmp2; + block[ 9] = tmp3; + block[10] = tmp4; + block[11] = tmp5; + break; + case 7: block[ 7] |= tmp0; + block[ 8] = tmp1; + block[ 9] = tmp2; + block[10] = tmp3; + block[11] = tmp4; + block[12] = tmp5; + break; + case 8: block[ 8] |= tmp0; + block[ 9] = tmp1; + block[10] = tmp2; + block[11] = tmp3; + block[12] = tmp4; + block[13] = tmp5; + break; + case 9: block[ 9] |= tmp0; + block[10] = tmp1; + block[11] = tmp2; + block[12] = tmp3; + block[13] = tmp4; + block[14] = tmp5; + break; + case 10: block[10] |= tmp0; + block[11] = tmp1; + block[12] = tmp2; + block[13] = tmp3; + block[14] = tmp4; + block[15] = tmp5; + break; + case 11: block[11] |= tmp0; + block[12] = tmp1; + block[13] = tmp2; + block[14] = tmp3; + block[15] = tmp4; + break; + case 12: block[12] |= tmp0; + block[13] = tmp1; + block[14] = tmp2; + block[15] = tmp3; + break; + case 13: block[13] |= tmp0; + block[14] = tmp1; + block[15] = tmp2; + break; + case 14: block[14] |= tmp0; + block[15] = tmp1; + break; + case 15: block[15] |= tmp0; + break; + } + u32 new_len = offset + append_len; + + return new_len; +} + +DECLSPEC u32 memcat16sc (u32 *block, const u32 offset, const u32 *append, const u32 append_len, u32 *digest) +{ + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + + #if defined IS_AMD || defined IS_GENERIC + const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); + const u32 tmp1 = hc_bytealign_be (in0, in1, offset); + const u32 tmp2 = hc_bytealign_be (in1, in2, offset); + const u32 tmp3 = hc_bytealign_be (in2, in3, offset); + const u32 tmp4 = hc_bytealign_be (in3, in4, offset); + const u32 tmp5 = hc_bytealign_be (in4, 0, offset); + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); + const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); + const u32 tmp2 = hc_byte_perm_S (in2, in1, selector); + const u32 tmp3 = hc_byte_perm_S (in3, in2, selector); + const u32 tmp4 = hc_byte_perm_S (in4, in3, selector); + const u32 tmp5 = hc_byte_perm_S (0, in4, selector); + #endif + + u32 carry[5] = { 0 }; + + switch (offset / 4) + { + case 0: block[ 0] |= tmp0; + block[ 1] = tmp1; + block[ 2] = tmp2; + block[ 3] = tmp3; + block[ 4] = tmp4; + block[ 5] = tmp5; + break; + case 1: block[ 1] |= tmp0; + block[ 2] = tmp1; + block[ 3] = tmp2; + block[ 4] = tmp3; + block[ 5] = tmp4; + block[ 6] = tmp5; + break; + case 2: block[ 2] |= tmp0; + block[ 3] = tmp1; + block[ 4] = tmp2; + block[ 5] = tmp3; + block[ 6] = tmp4; + block[ 7] = tmp5; + break; + case 3: block[ 3] |= tmp0; + block[ 4] = tmp1; + block[ 5] = tmp2; + block[ 6] = tmp3; + block[ 7] = tmp4; + block[ 8] = tmp5; + break; + case 4: block[ 4] |= tmp0; + block[ 5] = tmp1; + block[ 6] = tmp2; + block[ 7] = tmp3; + block[ 8] = tmp4; + block[ 9] = tmp5; + break; + case 5: block[ 5] |= tmp0; + block[ 6] = tmp1; + block[ 7] = tmp2; + block[ 8] = tmp3; + block[ 9] = tmp4; + block[10] = tmp5; + break; + case 6: block[ 6] |= tmp0; + block[ 7] = tmp1; + block[ 8] = tmp2; + block[ 9] = tmp3; + block[10] = tmp4; + block[11] = tmp5; + break; + case 7: block[ 7] |= tmp0; + block[ 8] = tmp1; + block[ 9] = tmp2; + block[10] = tmp3; + block[11] = tmp4; + block[12] = tmp5; + break; + case 8: block[ 8] |= tmp0; + block[ 9] = tmp1; + block[10] = tmp2; + block[11] = tmp3; + block[12] = tmp4; + block[13] = tmp5; + break; + case 9: block[ 9] |= tmp0; + block[10] = tmp1; + block[11] = tmp2; + block[12] = tmp3; + block[13] = tmp4; + block[14] = tmp5; + break; + case 10: block[10] |= tmp0; + block[11] = tmp1; + block[12] = tmp2; + block[13] = tmp3; + block[14] = tmp4; + block[15] = tmp5; + break; + case 11: block[11] |= tmp0; + block[12] = tmp1; + block[13] = tmp2; + block[14] = tmp3; + block[15] = tmp4; + carry[ 0] = tmp5; + break; + case 12: block[12] |= tmp0; + block[13] = tmp1; + block[14] = tmp2; + block[15] = tmp3; + carry[ 0] = tmp4; + carry[ 1] = tmp5; + break; + case 13: block[13] |= tmp0; + block[14] = tmp1; + block[15] = tmp2; + carry[ 0] = tmp3; + carry[ 1] = tmp4; + carry[ 2] = tmp5; + break; + case 14: block[14] |= tmp0; + block[15] = tmp1; + carry[ 0] = tmp2; + carry[ 1] = tmp3; + carry[ 2] = tmp4; + carry[ 3] = tmp5; + break; + case 15: block[15] |= tmp0; + carry[ 0] = tmp1; + carry[ 1] = tmp2; + carry[ 2] = tmp3; + carry[ 3] = tmp4; + carry[ 4] = tmp5; + break; + } + + u32 new_len = offset + append_len; + + if (new_len >= 64) + { + new_len -= 64; + + sha256_transform (block + 0, block + 4, block + 8, block + 12, digest); + + block[ 0] = carry[0]; + block[ 1] = carry[1]; + block[ 2] = carry[2]; + block[ 3] = carry[3]; + block[ 4] = carry[4]; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; + } + + return new_len; +} + +DECLSPEC void truncate_block_5x4_be_S (u32 *w0, const u32 len) +{ + switch (len) + { + case 0: + w0[0] = 0; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 1: + w0[0] &= 0xff000000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 2: + w0[0] &= 0xffff0000; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 3: + w0[0] &= 0xffffff00; + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 4: + w0[1] = 0; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 5: + w0[1] &= 0xff000000; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 6: + w0[1] &= 0xffff0000; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 7: + w0[1] &= 0xffffff00; + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 8: + w0[2] = 0; + w0[3] = 0; + w0[4] = 0; + break; + + case 9: + w0[2] &= 0xff000000; + w0[3] = 0; + w0[4] = 0; + break; + + case 10: + w0[2] &= 0xffff0000; + w0[3] = 0; + w0[4] = 0; + break; + + case 11: + w0[2] &= 0xffffff00; + w0[3] = 0; + w0[4] = 0; + break; + + case 12: + w0[3] = 0; + w0[4] = 0; + break; + + case 13: + w0[3] &= 0xff000000; + w0[4] = 0; + break; + + case 14: + w0[3] &= 0xffff0000; + w0[4] = 0; + break; + + case 15: + w0[3] &= 0xffffff00; + w0[4] = 0; + break; + + case 16: + w0[4] = 0; + break; + + case 17: + w0[4] &= 0xff000000; + break; + + case 18: + w0[4] &= 0xffff0000; + break; + + case 19: + w0[4] &= 0xffffff00; + break; + } +} + +DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u32 append_len) +{ u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; u32 in3 = append[3]; - tmp0 = hc_byte_perm ( 0, in0, selector); - tmp1 = hc_byte_perm (in0, in1, selector); - tmp2 = hc_byte_perm (in1, in2, selector); - tmp3 = hc_byte_perm (in2, in3, selector); - tmp4 = hc_byte_perm (in3, 0, selector); + #if defined IS_AMD || defined IS_GENERIC + const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset); + const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset); + const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset); + const u32 tmp3 = hc_bytealign_be_S (in2, in3, offset); + const u32 tmp4 = hc_bytealign_be_S (in3, 0, offset); + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); + const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); + const u32 tmp2 = hc_byte_perm_S (in2, in1, selector); + const u32 tmp3 = hc_byte_perm_S (in3, in2, selector); + const u32 tmp4 = hc_byte_perm_S (0, in3, selector); #endif switch (offset / 4) @@ -536,44 +877,28 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3 DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, const u32 append_len) { - u32 tmp0; - u32 tmp1; - u32 tmp2; - u32 tmp3; - u32 tmp4; - - #if defined IS_AMD || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; u32 in3 = append[3]; - u32 in4 = 0x80; + u32 in4 = 0x80000000; - tmp0 = hc_bytealign ( 0, in0, offset); - tmp1 = hc_bytealign (in0, in1, offset); - tmp2 = hc_bytealign (in1, in2, offset); - tmp3 = hc_bytealign (in2, in3, offset); - tmp4 = hc_bytealign (in3, in4, offset); + #if defined IS_AMD || defined IS_GENERIC + const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset); + const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset); + const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset); + const u32 tmp3 = hc_bytealign_be_S (in2, in3, offset); + const u32 tmp4 = hc_bytealign_be_S (in3, in4, offset); #endif #ifdef IS_NV - const int offset_mod_4 = offset & 3; + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; - const int offset_minus_4 = 4 - offset_mod_4; - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - u32 in0 = append[0]; - u32 in1 = append[1]; - u32 in2 = append[2]; - u32 in3 = append[3]; - u32 in4 = 0x80; - - tmp0 = hc_byte_perm ( 0, in0, selector); - tmp1 = hc_byte_perm (in0, in1, selector); - tmp2 = hc_byte_perm (in1, in2, selector); - tmp3 = hc_byte_perm (in2, in3, selector); - tmp4 = hc_byte_perm (in3, in4, selector); + const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); + const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); + const u32 tmp2 = hc_byte_perm_S (in2, in1, selector); + const u32 tmp3 = hc_byte_perm_S (in3, in2, selector); + const u32 tmp4 = hc_byte_perm_S (in4, in3, selector); #endif switch (offset / 4) @@ -679,6 +1004,69 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons return offset + append_len; } +DECLSPEC u32 memcat24 (u32 *block, const u32 offset, const u32 *append, const u32 append_len) +{ + u32 in0 = append[0]; + u32 in1 = append[1]; + u32 in2 = append[2]; + u32 in3 = append[3]; + u32 in4 = append[4]; + + #if defined IS_AMD || defined IS_GENERIC + const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset); + const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset); + const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset); + const u32 tmp3 = hc_bytealign_be_S (in2, in3, offset); + const u32 tmp4 = hc_bytealign_be_S (in3, in4, offset); + const u32 tmp5 = hc_bytealign_be_S (in4, 0, offset); + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); + const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); + const u32 tmp2 = hc_byte_perm_S (in2, in1, selector); + const u32 tmp3 = hc_byte_perm_S (in3, in2, selector); + const u32 tmp4 = hc_byte_perm_S (in4, in3, selector); + const u32 tmp5 = hc_byte_perm_S (0, in4, selector); + #endif + + switch (offset / 4) + { + case 0: block[ 0] |= tmp0; + block[ 1] = tmp1; + block[ 2] = tmp2; + block[ 3] = tmp3; + block[ 4] = tmp4; + block[ 5] = tmp5; + break; + case 1: block[ 1] |= tmp0; + block[ 2] = tmp1; + block[ 3] = tmp2; + block[ 4] = tmp3; + block[ 5] = tmp4; + block[ 6] = tmp5; + break; + case 2: block[ 2] |= tmp0; + block[ 3] = tmp1; + block[ 4] = tmp2; + block[ 5] = tmp3; + block[ 6] = tmp4; + block[ 7] = tmp5; + break; + case 3: block[ 3] |= tmp0; + block[ 4] = tmp1; + block[ 5] = tmp2; + block[ 6] = tmp3; + block[ 7] = tmp4; + block[ 8] = tmp5; + break; + } + + return offset + append_len; +} + KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) { /** @@ -691,10 +1079,10 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) u32 w0[4]; - w0[0] = pws[gid].i[0]; - w0[1] = pws[gid].i[1]; - w0[2] = pws[gid].i[2]; - w0[3] = pws[gid].i[3]; + w0[0] = hc_swap32_S (pws[gid].i[0]); + w0[1] = hc_swap32_S (pws[gid].i[1]); + w0[2] = hc_swap32_S (pws[gid].i[2]); + w0[3] = hc_swap32_S (pws[gid].i[3]); const u32 pw_len = pws[gid].pw_len & 63; @@ -702,12 +1090,13 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) * salt */ - u32 salt_buf[4]; + u32 salt_buf[5]; - salt_buf[0] = salt_bufs[salt_pos].salt_buf[0]; - salt_buf[1] = salt_bufs[salt_pos].salt_buf[1]; - salt_buf[2] = salt_bufs[salt_pos].salt_buf[2]; - salt_buf[3] = salt_bufs[salt_pos].salt_buf[3]; + salt_buf[0] = hc_swap32_S (salt_bufs[salt_pos].salt_buf[0]); + salt_buf[1] = hc_swap32_S (salt_bufs[salt_pos].salt_buf[1]); + salt_buf[2] = hc_swap32_S (salt_bufs[salt_pos].salt_buf[2]); + salt_buf[3] = hc_swap32_S (salt_bufs[salt_pos].salt_buf[3]); + salt_buf[4] = hc_swap32_S (salt_bufs[salt_pos].salt_buf[4]); u32 salt_len = salt_bufs[salt_pos].salt_len; @@ -720,6 +1109,23 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) u32 block[16]; + block[ 0] = 0; + block[ 1] = 0; + block[ 2] = 0; + block[ 3] = 0; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; + u32 alt_result[8]; u32 p_bytes[8]; u32 s_bytes[8]; @@ -728,33 +1134,25 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) block_len = 0; - bzero16 (block); - /* Add key. */ block_len = memcat16 (block, block_len, w0, pw_len); /* Add salt. */ - block_len = memcat16 (block, block_len, salt_buf, salt_len); + block_len = memcat16s (block, block_len, salt_buf, salt_len); /* Add key again. */ block_len = memcat16 (block, block_len, w0, pw_len); - append_0x80_1x16 (block, block_len); + append_0x80_1x16 (block, block_len ^ 3); - block[15] = hc_swap32_S (block_len * 8); + block[15] = block_len * 8; init_ctx (alt_result); - sha256_transform_transport (block, alt_result); - - bswap8 (alt_result); - - block_len = 0; - - bzero16 (block); + sha256_transform (block + 0, block + 4, block + 8, block + 12, alt_result); u32 alt_result_tmp[8]; @@ -767,7 +1165,26 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) alt_result_tmp[6] = 0; alt_result_tmp[7] = 0; - truncate_block_4x4_le_S (alt_result_tmp, pw_len); + truncate_block_4x4_be_S (alt_result_tmp, pw_len); + + block[ 0] = 0; + block[ 1] = 0; + block[ 2] = 0; + block[ 3] = 0; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; + + block_len = 0; /* Add the key string. */ @@ -777,7 +1194,7 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) characters and it ends at the first `$' character (for compatibility with existing implementations). */ - block_len = memcat16 (block, block_len, salt_buf, salt_len); + block_len = memcat16s (block, block_len, salt_buf, salt_len); /* Now get result of this (32 bytes) and add it to the other context. */ @@ -817,20 +1234,33 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) } } - append_0x80_1x16 (block, block_len); + append_0x80_1x16 (block, block_len ^ 3); if (block_len >= 56) { - sha256_transform_transport (block, alt_result); + sha256_transform (block + 0, block + 4, block + 8, block + 12, alt_result); - bzero16 (block); + block[ 0] = 0; + block[ 1] = 0; + block[ 2] = 0; + block[ 3] = 0; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; } - block[15] = hc_swap32_S (transform_len * 8); + block[15] = transform_len * 8; - sha256_transform_transport (block, alt_result); - - bswap8 (alt_result); + sha256_transform (block + 0, block + 4, block + 8, block + 12, alt_result); tmps[gid].alt_result[0] = alt_result[0]; tmps[gid].alt_result[1] = alt_result[1]; @@ -843,11 +1273,26 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) /* Start computation of P byte sequence. */ - block_len = 0; - transform_len = 0; - bzero16 (block); + block[ 0] = 0; + block[ 1] = 0; + block[ 2] = 0; + block[ 3] = 0; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; + + block_len = 0; /* For every character in the password add the entire password. */ @@ -862,22 +1307,35 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) /* Finish the digest. */ - append_0x80_1x16 (block, block_len); + append_0x80_1x16 (block, block_len ^ 3); if (block_len >= 56) { - sha256_transform_transport (block, p_bytes); + sha256_transform (block + 0, block + 4, block + 8, block + 12, p_bytes); - bzero16 (block); + block[ 0] = 0; + block[ 1] = 0; + block[ 2] = 0; + block[ 3] = 0; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; } - block[15] = hc_swap32_S (transform_len * 8); + block[15] = transform_len * 8; - sha256_transform_transport (block, p_bytes); + sha256_transform (block + 0, block + 4, block + 8, block + 12, p_bytes); - bswap8 (p_bytes); - - truncate_block_4x4_le_S (p_bytes, pw_len); + truncate_block_4x4_be_S (p_bytes, pw_len); tmps[gid].p_bytes[0] = p_bytes[0]; tmps[gid].p_bytes[1] = p_bytes[1]; @@ -886,46 +1344,75 @@ KERNEL_FQ void m07400_init (KERN_ATTR_TMPS (sha256crypt_tmp_t)) /* Start computation of S byte sequence. */ - block_len = 0; - transform_len = 0; - bzero16 (block); + block[ 0] = 0; + block[ 1] = 0; + block[ 2] = 0; + block[ 3] = 0; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; + + block_len = 0; /* For every character in the password add the entire password. */ init_ctx (s_bytes); - for (u32 j = 0; j < 16 + (alt_result[0] & 0xff); j++) + for (u32 j = 0; j < 16 + (alt_result[0] >> 24); j++) { - block_len = memcat16c (block, block_len, salt_buf, salt_len, s_bytes); + block_len = memcat16sc (block, block_len, salt_buf, salt_len, s_bytes); transform_len += salt_len; } /* Finish the digest. */ - append_0x80_1x16 (block, block_len); + append_0x80_1x16 (block, block_len ^ 3); if (block_len >= 56) { - sha256_transform_transport (block, s_bytes); + sha256_transform (block + 0, block + 4, block + 8, block + 12, s_bytes); - bzero16 (block); + block[ 0] = 0; + block[ 1] = 0; + block[ 2] = 0; + block[ 3] = 0; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; } - block[15] = hc_swap32_S (transform_len * 8); + block[15] = transform_len * 8; - sha256_transform_transport (block, s_bytes); + sha256_transform (block + 0, block + 4, block + 8, block + 12, s_bytes); - bswap8 (s_bytes); - - truncate_block_4x4_le_S (s_bytes, salt_len); + truncate_block_5x4_be_S (s_bytes, salt_len); tmps[gid].s_bytes[0] = s_bytes[0]; tmps[gid].s_bytes[1] = s_bytes[1]; tmps[gid].s_bytes[2] = s_bytes[2]; tmps[gid].s_bytes[3] = s_bytes[3]; + tmps[gid].s_bytes[4] = s_bytes[4]; } KERNEL_FQ void m07400_loop (KERN_ATTR_TMPS (sha256crypt_tmp_t)) @@ -958,14 +1445,15 @@ KERNEL_FQ void m07400_loop (KERN_ATTR_TMPS (sha256crypt_tmp_t)) p_bytes_x80[2] = tmps[gid].p_bytes[2]; p_bytes_x80[3] = tmps[gid].p_bytes[3]; - append_0x80_1x4 (p_bytes_x80, pw_len); + append_0x80_1x4_S (p_bytes_x80, pw_len ^ 3); - u32 s_bytes[4]; + u32 s_bytes[5]; s_bytes[0] = tmps[gid].s_bytes[0]; s_bytes[1] = tmps[gid].s_bytes[1]; s_bytes[2] = tmps[gid].s_bytes[2]; s_bytes[3] = tmps[gid].s_bytes[3]; + s_bytes[4] = tmps[gid].s_bytes[4]; // 4 extra bytes for MySQL 7.5+ hashes u32 alt_result[8]; @@ -991,9 +1479,6 @@ KERNEL_FQ void m07400_loop (KERN_ATTR_TMPS (sha256crypt_tmp_t)) u32 block[32]; - bzero16 (&block[ 0]); - bzero16 (&block[16]); - u32 block_len = 0; const u32 j1 = (j & 1) ? 1 : 0; @@ -1002,30 +1487,93 @@ KERNEL_FQ void m07400_loop (KERN_ATTR_TMPS (sha256crypt_tmp_t)) if (j1) { - block[0] = p_bytes[0]; - block[1] = p_bytes[1]; - block[2] = p_bytes[2]; - block[3] = p_bytes[3]; + block[ 0] = p_bytes[0]; + block[ 1] = p_bytes[1]; + block[ 2] = p_bytes[2]; + block[ 3] = p_bytes[3]; + block[ 4] = 0; + block[ 5] = 0; + block[ 6] = 0; + block[ 7] = 0; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; + block[16] = 0; + block[17] = 0; + block[18] = 0; + block[19] = 0; + block[20] = 0; + block[21] = 0; + block[22] = 0; + block[23] = 0; + block[24] = 0; + block[25] = 0; + block[26] = 0; + block[27] = 0; + block[28] = 0; + block[29] = 0; + block[30] = 0; + block[31] = 0; block_len = pw_len; + + if (j3) + { + block_len = memcat24 (block, block_len, s_bytes, salt_len); + } } else { - block[0] = alt_result[0]; - block[1] = alt_result[1]; - block[2] = alt_result[2]; - block[3] = alt_result[3]; - block[4] = alt_result[4]; - block[5] = alt_result[5]; - block[6] = alt_result[6]; - block[7] = alt_result[7]; + block[ 0] = alt_result[0]; + block[ 1] = alt_result[1]; + block[ 2] = alt_result[2]; + block[ 3] = alt_result[3]; + block[ 4] = alt_result[4]; + block[ 5] = alt_result[5]; + block[ 6] = alt_result[6]; + block[ 7] = alt_result[7]; + block[ 8] = 0; + block[ 9] = 0; + block[10] = 0; + block[11] = 0; + block[12] = 0; + block[13] = 0; + block[14] = 0; + block[15] = 0; + block[16] = 0; + block[17] = 0; + block[18] = 0; + block[19] = 0; + block[20] = 0; + block[21] = 0; + block[22] = 0; + block[23] = 0; + block[24] = 0; + block[25] = 0; + block[26] = 0; + block[27] = 0; + block[28] = 0; + block[29] = 0; + block[30] = 0; + block[31] = 0; block_len = 32; - } - if (j3) - { - block_len = memcat20 (block, block_len, s_bytes, salt_len); + if (j3) + { + block[ 8] = s_bytes[0]; + block[ 9] = s_bytes[1]; + block[10] = s_bytes[2]; + block[11] = s_bytes[3]; + block[12] = s_bytes[4]; + + block_len += salt_len; + } } if (j7) @@ -1045,31 +1593,30 @@ KERNEL_FQ void m07400_loop (KERN_ATTR_TMPS (sha256crypt_tmp_t)) if (block_len >= 56) { - sha256_transform_transport (block, tmp); + sha256_transform (block + 0, block + 4, block + 8, block + 12, tmp); block[ 0] = block[16]; block[ 1] = block[17]; block[ 2] = block[18]; block[ 3] = block[19]; - block[ 4] = 0; - block[ 5] = 0; - block[ 6] = 0; - block[ 7] = 0; - block[ 8] = 0; - block[ 9] = 0; - block[10] = 0; - block[11] = 0; - block[12] = 0; - block[13] = 0; - block[14] = 0; - block[15] = 0; + block[ 4] = block[20]; + block[ 5] = block[21]; + block[ 6] = block[22]; + block[ 7] = block[23]; + block[ 8] = block[24]; + block[ 9] = block[25]; + block[10] = block[26]; + block[11] = block[27]; + block[12] = block[28]; + block[13] = block[29]; + block[14] = block[30]; + block[15] = block[31]; } - block[15] = hc_swap32_S (block_len * 8); + block[14] = 0; + block[15] = block_len * 8; - sha256_transform_transport (block, tmp); - - bswap8 (tmp); + sha256_transform (block + 0, block + 4, block + 8, block + 12, tmp); alt_result[0] = tmp[0]; alt_result[1] = tmp[1]; @@ -1103,10 +1650,10 @@ KERNEL_FQ void m07400_comp (KERN_ATTR_TMPS (sha256crypt_tmp_t)) const u64 lid = get_local_id (0); - const u32 r0 = tmps[gid].alt_result[0]; - const u32 r1 = tmps[gid].alt_result[1]; - const u32 r2 = tmps[gid].alt_result[2]; - const u32 r3 = tmps[gid].alt_result[3]; + const u32 r0 = hc_swap32_S (tmps[gid].alt_result[0]); + const u32 r1 = hc_swap32_S (tmps[gid].alt_result[1]); + const u32 r2 = hc_swap32_S (tmps[gid].alt_result[2]); + const u32 r3 = hc_swap32_S (tmps[gid].alt_result[3]); #define il_pos 0