diff --git a/OpenCL/m00500-optimized.cl b/OpenCL/m00500-optimized.cl index 19f7153ff..38a361b96 100644 --- a/OpenCL/m00500-optimized.cl +++ b/OpenCL/m00500-optimized.cl @@ -32,7 +32,7 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 tmp3; u32 tmp4; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; @@ -45,12 +45,18 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons tmp4 = hc_bytealign (in3, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; @@ -139,7 +145,7 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 tmp3; u32 tmp4; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; @@ -153,12 +159,18 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, tmp4 = hc_bytealign (in3, in4, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; @@ -246,7 +258,7 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 tmp1; u32 tmp2; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; @@ -255,12 +267,18 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const tmp2 = hc_bytealign (in1, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; diff --git a/OpenCL/m01500_a3-pure.cl b/OpenCL/m01500_a3-pure.cl index c2c4245e1..7a5adf017 100644 --- a/OpenCL/m01500_a3-pure.cl +++ b/OpenCL/m01500_a3-pure.cl @@ -1664,18 +1664,18 @@ DECLSPEC void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 DECLSPEC void DESCrypt (const u32 SALT, const u32 K00, const u32 K01, const u32 K02, const u32 K03, const u32 K04, const u32 K05, const u32 K06, const u32 K07, const u32 K08, const u32 K09, const u32 K10, const u32 K11, const u32 K12, const u32 K13, const u32 K14, const u32 K15, const u32 K16, const u32 K17, const u32 K18, const u32 K19, const u32 K20, const u32 K21, const u32 K22, const u32 K23, const u32 K24, const u32 K25, const u32 K26, const u32 K27, const u32 K28, const u32 K29, const u32 K30, const u32 K31, const u32 K32, const u32 K33, const u32 K34, const u32 K35, const u32 K36, const u32 K37, const u32 K38, const u32 K39, const u32 K40, const u32 K41, const u32 K42, const u32 K43, const u32 K44, const u32 K45, const u32 K46, const u32 K47, const u32 K48, const u32 K49, const u32 K50, const u32 K51, const u32 K52, const u32 K53, const u32 K54, const u32 K55, u32 *D00, u32 *D01, u32 *D02, u32 *D03, u32 *D04, u32 *D05, u32 *D06, u32 *D07, u32 *D08, u32 *D09, u32 *D10, u32 *D11, u32 *D12, u32 *D13, u32 *D14, u32 *D15, u32 *D16, u32 *D17, u32 *D18, u32 *D19, u32 *D20, u32 *D21, u32 *D22, u32 *D23, u32 *D24, u32 *D25, u32 *D26, u32 *D27, u32 *D28, u32 *D29, u32 *D30, u32 *D31, u32 *D32, u32 *D33, u32 *D34, u32 *D35, u32 *D36, u32 *D37, u32 *D38, u32 *D39, u32 *D40, u32 *D41, u32 *D42, u32 *D43, u32 *D44, u32 *D45, u32 *D46, u32 *D47, u32 *D48, u32 *D49, u32 *D50, u32 *D51, u32 *D52, u32 *D53, u32 *D54, u32 *D55, u32 *D56, u32 *D57, u32 *D58, u32 *D59, u32 *D60, u32 *D61, u32 *D62, u32 *D63) { - const u32 s001 = (0x001 & SALT) ? 0xffffffff : 0; - const u32 s002 = (0x002 & SALT) ? 0xffffffff : 0; - const u32 s004 = (0x004 & SALT) ? 0xffffffff : 0; - const u32 s008 = (0x008 & SALT) ? 0xffffffff : 0; - const u32 s010 = (0x010 & SALT) ? 0xffffffff : 0; - const u32 s020 = (0x020 & SALT) ? 0xffffffff : 0; - const u32 s040 = (0x040 & SALT) ? 0xffffffff : 0; - const u32 s080 = (0x080 & SALT) ? 0xffffffff : 0; - const u32 s100 = (0x100 & SALT) ? 0xffffffff : 0; - const u32 s200 = (0x200 & SALT) ? 0xffffffff : 0; - const u32 s400 = (0x400 & SALT) ? 0xffffffff : 0; - const u32 s800 = (0x800 & SALT) ? 0xffffffff : 0; + const u32 s001 = (0x001 & SALT) ? 1 : 0; + const u32 s002 = (0x002 & SALT) ? 1 : 0; + const u32 s004 = (0x004 & SALT) ? 1 : 0; + const u32 s008 = (0x008 & SALT) ? 1 : 0; + const u32 s010 = (0x010 & SALT) ? 1 : 0; + const u32 s020 = (0x020 & SALT) ? 1 : 0; + const u32 s040 = (0x040 & SALT) ? 1 : 0; + const u32 s080 = (0x080 & SALT) ? 1 : 0; + const u32 s100 = (0x100 & SALT) ? 1 : 0; + const u32 s200 = (0x200 & SALT) ? 1 : 0; + const u32 s400 = (0x400 & SALT) ? 1 : 0; + const u32 s800 = (0x800 & SALT) ? 1 : 0; KXX_DECL u32 k00, k01, k02, k03, k04, k05; KXX_DECL u32 k06, k07, k08, k09, k10, k11; diff --git a/OpenCL/m01600-optimized.cl b/OpenCL/m01600-optimized.cl index cfaad44cc..62194e973 100644 --- a/OpenCL/m01600-optimized.cl +++ b/OpenCL/m01600-optimized.cl @@ -31,7 +31,7 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 tmp3; u32 tmp4; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; @@ -44,12 +44,18 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons tmp4 = hc_bytealign (in3, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; @@ -138,7 +144,7 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 tmp3; u32 tmp4; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; @@ -152,12 +158,18 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, tmp4 = hc_bytealign (in3, in4, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; @@ -245,7 +257,7 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 tmp1; u32 tmp2; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; @@ -254,12 +266,18 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const tmp2 = hc_bytealign (in1, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; diff --git a/OpenCL/m05800-optimized.cl b/OpenCL/m05800-optimized.cl index 38099159f..9f8f5a3cc 100644 --- a/OpenCL/m05800-optimized.cl +++ b/OpenCL/m05800-optimized.cl @@ -2119,7 +2119,7 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u u32 tmp4; u32 tmp5; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; @@ -2134,12 +2134,18 @@ DECLSPEC void append_salt (u32 *w0, u32 *w1, u32 *w2, const u32 *append, const u tmp5 = hc_bytealign (in4, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; diff --git a/OpenCL/m06300-optimized.cl b/OpenCL/m06300-optimized.cl index b7c9ddddd..f242259da 100644 --- a/OpenCL/m06300-optimized.cl +++ b/OpenCL/m06300-optimized.cl @@ -28,7 +28,7 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons u32 tmp3; u32 tmp4; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; @@ -41,12 +41,18 @@ DECLSPEC void memcat16 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, cons tmp4 = hc_bytealign (in3, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; @@ -135,7 +141,7 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, u32 tmp3; u32 tmp4; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; u32 in2 = append[2]; @@ -149,12 +155,18 @@ DECLSPEC void memcat16_x80 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, tmp4 = hc_bytealign (in3, in4, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; @@ -242,7 +254,7 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const u32 tmp1; u32 tmp2; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC u32 in0 = append[0]; u32 in1 = append[1]; @@ -251,12 +263,18 @@ DECLSPEC void memcat8 (u32 *block0, u32 *block1, u32 *block2, u32 *block3, const tmp2 = hc_bytealign (in1, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; + #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> (offset_minus_4 * 8)); + #endif u32 in0 = append[0]; u32 in1 = append[1]; diff --git a/OpenCL/m07400-optimized.cl b/OpenCL/m07400-optimized.cl index 7efa5c94e..5fb83a2ad 100644 --- a/OpenCL/m07400-optimized.cl +++ b/OpenCL/m07400-optimized.cl @@ -45,7 +45,7 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3 u32 in2 = append[2]; u32 in3 = append[3]; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); const u32 tmp1 = hc_bytealign_be (in0, in1, offset); const u32 tmp2 = hc_bytealign_be (in1, in2, offset); @@ -53,8 +53,15 @@ DECLSPEC u32 memcat16 (u32 *block, const u32 offset, const u32 *append, const u3 const u32 tmp4 = hc_bytealign_be (in3, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); @@ -165,7 +172,7 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u u32 in2 = append[2]; u32 in3 = append[3]; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); const u32 tmp1 = hc_bytealign_be (in0, in1, offset); const u32 tmp2 = hc_bytealign_be (in1, in2, offset); @@ -173,8 +180,15 @@ DECLSPEC u32 memcat16c (u32 *block, const u32 offset, const u32 *append, const u const u32 tmp4 = hc_bytealign_be (in3, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); @@ -322,7 +336,7 @@ DECLSPEC u32 memcat16s (u32 *block, const u32 offset, const u32 *append, const u u32 in3 = append[3]; u32 in4 = append[4]; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); const u32 tmp1 = hc_bytealign_be (in0, in1, offset); const u32 tmp2 = hc_bytealign_be (in1, in2, offset); @@ -331,8 +345,15 @@ DECLSPEC u32 memcat16s (u32 *block, const u32 offset, const u32 *append, const u const u32 tmp5 = hc_bytealign_be (in4, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); @@ -456,7 +477,7 @@ DECLSPEC u32 memcat16sc (u32 *block, const u32 offset, const u32 *append, const u32 in3 = append[3]; u32 in4 = append[4]; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC const u32 tmp0 = hc_bytealign_be ( 0, in0, offset); const u32 tmp1 = hc_bytealign_be (in0, in1, offset); const u32 tmp2 = hc_bytealign_be (in1, in2, offset); @@ -465,8 +486,15 @@ DECLSPEC u32 memcat16sc (u32 *block, const u32 offset, const u32 *append, const const u32 tmp5 = hc_bytealign_be (in4, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); @@ -756,7 +784,7 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3 u32 in2 = append[2]; u32 in3 = append[3]; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset); const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset); const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset); @@ -764,8 +792,15 @@ DECLSPEC u32 memcat20 (u32 *block, const u32 offset, const u32 *append, const u3 const u32 tmp4 = hc_bytealign_be_S (in3, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); @@ -915,7 +950,7 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons u32 in3 = append[3]; u32 in4 = 0x80000000; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset); const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset); const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset); @@ -923,8 +958,15 @@ DECLSPEC u32 memcat20_x80 (u32 *block, const u32 offset, const u32 *append, cons const u32 tmp4 = hc_bytealign_be_S (in3, in4, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); @@ -1074,7 +1116,7 @@ DECLSPEC u32 memcat24 (u32 *block, const u32 offset, const u32 *append, const u3 u32 in3 = append[3]; u32 in4 = append[4]; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC const u32 tmp0 = hc_bytealign_be_S ( 0, in0, offset); const u32 tmp1 = hc_bytealign_be_S (in0, in1, offset); const u32 tmp2 = hc_bytealign_be_S (in1, in2, offset); @@ -1083,8 +1125,15 @@ DECLSPEC u32 memcat24 (u32 *block, const u32 offset, const u32 *append, const u3 const u32 tmp5 = hc_bytealign_be_S (in4, 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif const u32 tmp0 = hc_byte_perm_S (in0, 0, selector); const u32 tmp1 = hc_byte_perm_S (in1, in0, selector); diff --git a/OpenCL/m10700-optimized.cl b/OpenCL/m10700-optimized.cl index a9b50a6ac..9779c8fe6 100644 --- a/OpenCL/m10700-optimized.cl +++ b/OpenCL/m10700-optimized.cl @@ -232,7 +232,7 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, u32 i; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC for (i = 0; i < pd; i++) sc[idx++] = pw[i]; sc[idx++] = pw[i] | hc_bytealign_be (bl[0], 0, pm4); @@ -242,8 +242,15 @@ DECLSPEC void make_sc (u32 *sc, const u32 *pw, const u32 pw_len, const u32 *bl, sc[idx++] = hc_bytealign_be ( 0, sc[i - 1], pm4); #endif - #ifdef IS_NV - int selector = (0x76543210 >> (pm4 * 4)) & 0xffff; + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV + const int selector = (0x76543210 >> ((pm4 & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((pm4 & 3) * 8)); + #endif for (i = 0; i < pd; i++) sc[idx++] = pw[i]; sc[idx++] = pw[i] @@ -263,16 +270,22 @@ DECLSPEC void make_pt_with_offset (u32 *pt, const u32 offset, const u32 *sc, con const u32 om = m % 4; const u32 od = m / 4; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC pt[0] = hc_bytealign_be (sc[od + 1], sc[od + 0], om); pt[1] = hc_bytealign_be (sc[od + 2], sc[od + 1], om); pt[2] = hc_bytealign_be (sc[od + 3], sc[od + 2], om); pt[3] = hc_bytealign_be (sc[od + 4], sc[od + 3], om); #endif - #ifdef IS_NV - int selector = (0x76543210 >> (om * 4)) & 0xffff; + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + #if defined IS_NV + const int selector = (0x76543210 >> ((om & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((om & 3) * 8)); + #endif pt[0] = hc_byte_perm (sc[od + 0], sc[od + 1], selector); pt[1] = hc_byte_perm (sc[od + 1], sc[od + 2], selector); pt[2] = hc_byte_perm (sc[od + 2], sc[od + 3], selector); diff --git a/OpenCL/m11600-pure.cl b/OpenCL/m11600-pure.cl index be42e185b..d321aee3a 100644 --- a/OpenCL/m11600-pure.cl +++ b/OpenCL/m11600-pure.cl @@ -42,13 +42,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co u32 tmp0; u32 tmp1; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC tmp0 = hc_bytealign_be (0, append, func_len); tmp1 = hc_bytealign_be (append, 0, func_len); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8)); + #endif tmp0 = hc_byte_perm (append, 0, selector); tmp1 = hc_byte_perm (0, append, selector); diff --git a/OpenCL/m12500-pure.cl b/OpenCL/m12500-pure.cl index f8ed47771..6112ec296 100644 --- a/OpenCL/m12500-pure.cl +++ b/OpenCL/m12500-pure.cl @@ -37,13 +37,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co u32 tmp0; u32 tmp1; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC tmp0 = hc_bytealign_be (0, append, func_len); tmp1 = hc_bytealign_be (append, 0, func_len); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8)); + #endif tmp0 = hc_byte_perm (append, 0, selector); tmp1 = hc_byte_perm (0, append, selector); diff --git a/OpenCL/m13800_a0-optimized.cl b/OpenCL/m13800_a0-optimized.cl index 6758ffbd4..043ed0d13 100644 --- a/OpenCL/m13800_a0-optimized.cl +++ b/OpenCL/m13800_a0-optimized.cl @@ -51,7 +51,7 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) u32x tmp15; u32x tmp16; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC tmp00 = hc_bytealign_be ( 0, carry[ 0], offset); tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset); tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset); @@ -71,8 +71,15 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) tmp16 = hc_bytealign_be (carry[15], 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif tmp00 = hc_byte_perm (carry[ 0], 0, selector); tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector); diff --git a/OpenCL/m13800_a1-optimized.cl b/OpenCL/m13800_a1-optimized.cl index 85e711b94..4227e48d5 100644 --- a/OpenCL/m13800_a1-optimized.cl +++ b/OpenCL/m13800_a1-optimized.cl @@ -49,7 +49,7 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) u32x tmp15; u32x tmp16; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC tmp00 = hc_bytealign_be ( 0, carry[ 0], offset); tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset); tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset); @@ -69,8 +69,15 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) tmp16 = hc_bytealign_be (carry[15], 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif tmp00 = hc_byte_perm (carry[ 0], 0, selector); tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector); diff --git a/OpenCL/m13800_a3-optimized.cl b/OpenCL/m13800_a3-optimized.cl index 65b759de0..895d4378c 100644 --- a/OpenCL/m13800_a3-optimized.cl +++ b/OpenCL/m13800_a3-optimized.cl @@ -48,7 +48,7 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) u32x tmp15; u32x tmp16; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC tmp00 = hc_bytealign_be ( 0, carry[ 0], offset); tmp01 = hc_bytealign_be (carry[ 0], carry[ 1], offset); tmp02 = hc_bytealign_be (carry[ 1], carry[ 2], offset); @@ -68,8 +68,15 @@ DECLSPEC void memcat64c_be (u32x *block, const u32 offset, u32x *carry) tmp16 = hc_bytealign_be (carry[15], 0, offset); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((offset & 3) * 8)); + #endif tmp00 = hc_byte_perm (carry[ 0], 0, selector); tmp01 = hc_byte_perm (carry[ 1], carry[ 0], selector); diff --git a/OpenCL/m23700-pure.cl b/OpenCL/m23700-pure.cl index af287574e..63e84cbf7 100644 --- a/OpenCL/m23700-pure.cl +++ b/OpenCL/m23700-pure.cl @@ -145,13 +145,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co u32 tmp0; u32 tmp1; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC tmp0 = hc_bytealign_be (0, append, func_len); tmp1 = hc_bytealign_be (append, 0, func_len); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8)); + #endif tmp0 = hc_byte_perm (append, 0, selector); tmp1 = hc_byte_perm (0, append, selector); diff --git a/OpenCL/m23800-pure.cl b/OpenCL/m23800-pure.cl index f6d345677..530c3268d 100644 --- a/OpenCL/m23800-pure.cl +++ b/OpenCL/m23800-pure.cl @@ -56,13 +56,20 @@ DECLSPEC void memcat8c_be (u32 *w0, u32 *w1, u32 *w2, u32 *w3, const u32 len, co u32 tmp0; u32 tmp1; - #if (defined IS_AMD || defined IS_HIP) || defined IS_GENERIC + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 0) || defined IS_GENERIC tmp0 = hc_bytealign_be (0, append, func_len); tmp1 = hc_bytealign_be (append, 0, func_len); #endif - #ifdef IS_NV + #if ((defined IS_AMD || defined IS_HIP) && HAS_VPERM == 1) || defined IS_NV + + #if defined IS_NV const int selector = (0x76543210 >> ((func_len & 3) * 4)) & 0xffff; + #endif + + #if (defined IS_AMD || defined IS_HIP) + const int selector = l32_from_64_S (0x0706050403020100UL >> ((func_len & 3) * 8)); + #endif tmp0 = hc_byte_perm (append, 0, selector); tmp1 = hc_byte_perm (0, append, selector); diff --git a/docs/changes.txt b/docs/changes.txt index 7e0aab1a2..06b7f03ab 100644 --- a/docs/changes.txt +++ b/docs/changes.txt @@ -18,6 +18,7 @@ ## Improvements ## +- AMD GPUs: Add inline assembly code for md5crypt/sha256crypt, PDF 1.7, 7-Zip, RAR3, Samsung Android and Windows Phone 8+ - Blake Kernels: Optimize BLAKE2B_ROUND() 64 bit rotates giving a 5% performance increase - Brain Session: Adds hashconfig specific opti_type and opts_type parameters to hashcat session computation to cover features like -O and -M - Kernel Threads: Use warp size / wavefront size query instead of hardcoded values as base for kernel threads @@ -28,6 +29,7 @@ ## Technical ## +- ADL: Updated support for AMD Display Library to 14.0, updated datatypes and added support for OverDrive 7 and 8 based GPUs - Commandline: Throw an error if separator character given by the user with -p option is not exactly 1 byte - Kernel Cache: Add kernel threads into hash computation which is later used in the kernel cache filename - HIP Kernels: Got rid of hip/hip_runtime.h dependancy to enable more easy integration of the HIP backend on Windows diff --git a/src/modules/module_01500.c b/src/modules/module_01500.c index ea01dab96..dc7b7b47e 100644 --- a/src/modules/module_01500.c +++ b/src/modules/module_01500.c @@ -184,7 +184,11 @@ char *module_jit_build_options (MAYBE_UNUSED const hashconfig_t *hashconfig, MAY { if ((user_options->attack_mode == ATTACK_MODE_BF) && (hashes->salts_cnt == 1) && (user_options->slow_candidates == false)) { - hc_asprintf (&jit_build_options, "-DDESCRYPT_SALT=%u -D _unroll", hashes->salts_buf[0].salt_buf[0] & 0xfff); + hc_asprintf (&jit_build_options, "-DDESCRYPT_SALT=%u -D _unroll -fno-experimental-new-pass-manager", hashes->salts_buf[0].salt_buf[0] & 0xfff); + } + else + { + hc_asprintf (&jit_build_options, "-D _unroll -fno-experimental-new-pass-manager"); } } else