1
0
mirror of https://github.com/hashcat/hashcat.git synced 2025-02-08 21:52:51 +00:00

Add fine-tuned AMD GCN control macros

This commit is contained in:
jsteube 2017-09-07 20:33:43 +02:00
parent 3b89153c2d
commit ac9f1da747
4 changed files with 103 additions and 139 deletions

View File

@ -225,7 +225,7 @@ static void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x3727); out1[1] = __byte_perm (in[0], 0, 0x3727);
out1[0] = __byte_perm (in[0], 0, 0x1707); out1[0] = __byte_perm (in[0], 0, 0x1707);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x03070207); out2[3] = __byte_perm (in[3], 0, 0x03070207);
out2[2] = __byte_perm (in[3], 0, 0x01070007); out2[2] = __byte_perm (in[3], 0, 0x01070007);
@ -263,7 +263,7 @@ static void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x1707); out1[1] = __byte_perm (in[0], 0, 0x1707);
out1[0] = __byte_perm (in[0], 0, 0x3727); out1[0] = __byte_perm (in[0], 0, 0x3727);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x01070007); out2[3] = __byte_perm (in[3], 0, 0x01070007);
out2[2] = __byte_perm (in[3], 0, 0x03070207); out2[2] = __byte_perm (in[3], 0, 0x03070207);
@ -301,7 +301,7 @@ static void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x7372); out1[1] = __byte_perm (in[0], 0, 0x7372);
out1[0] = __byte_perm (in[0], 0, 0x7170); out1[0] = __byte_perm (in[0], 0, 0x7170);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x07030702); out2[3] = __byte_perm (in[3], 0, 0x07030702);
out2[2] = __byte_perm (in[3], 0, 0x07010700); out2[2] = __byte_perm (in[3], 0, 0x07010700);
@ -339,7 +339,7 @@ static void make_utf16leN (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x7170); out1[1] = __byte_perm (in[0], 0, 0x7170);
out1[0] = __byte_perm (in[0], 0, 0x7372); out1[0] = __byte_perm (in[0], 0, 0x7372);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x07010700); out2[3] = __byte_perm (in[3], 0, 0x07010700);
out2[2] = __byte_perm (in[3], 0, 0x07030702); out2[2] = __byte_perm (in[3], 0, 0x07030702);
@ -373,7 +373,7 @@ static void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4])
out[2] = __byte_perm (in2[0], in2[1], 0x4602); out[2] = __byte_perm (in2[0], in2[1], 0x4602);
out[3] = __byte_perm (in2[2], in2[3], 0x4602); out[3] = __byte_perm (in2[2], in2[3], 0x4602);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm (in1[0], in1[1], 0x04060002); out[0] = __byte_perm (in1[0], in1[1], 0x04060002);
out[1] = __byte_perm (in1[2], in1[3], 0x04060002); out[1] = __byte_perm (in1[2], in1[3], 0x04060002);
@ -403,7 +403,7 @@ static void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4])
out[2] = __byte_perm (in2[0], in2[1], 0x6420); out[2] = __byte_perm (in2[0], in2[1], 0x6420);
out[3] = __byte_perm (in2[2], in2[3], 0x6420); out[3] = __byte_perm (in2[2], in2[3], 0x6420);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm (in1[0], in1[1], 0x06040200); out[0] = __byte_perm (in1[0], in1[1], 0x06040200);
out[1] = __byte_perm (in1[2], in1[3], 0x06040200); out[1] = __byte_perm (in1[2], in1[3], 0x06040200);
@ -1266,7 +1266,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
const int offset_minus_4 = 4 - offset_mod_4; const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32 (w0[0]); w0[0] = swap32 (w0[0]);
w0[1] = swap32 (w0[1]); w0[1] = swap32 (w0[1]);
w0[2] = swap32 (w0[2]); w0[2] = swap32 (w0[2]);
@ -1625,13 +1625,13 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
w3[3] = swap32 (w3[3]); w3[3] = swap32 (w3[3]);
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif #endif
@ -3279,7 +3279,7 @@ static void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4]
static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
@ -3606,13 +3606,13 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -3944,7 +3944,7 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x
static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -4405,13 +4405,13 @@ static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4]
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -4882,7 +4882,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
const int offset_minus_4 = 4 - offset_mod_4; const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32 (w0[0]); w0[0] = swap32 (w0[0]);
w0[1] = swap32 (w0[1]); w0[1] = swap32 (w0[1]);
w0[2] = swap32 (w0[2]); w0[2] = swap32 (w0[2]);
@ -6105,13 +6105,13 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
w7[3] = swap32 (w7[3]); w7[3] = swap32 (w7[3]);
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif #endif
@ -6682,7 +6682,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -7839,13 +7839,13 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4],
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -9008,7 +9008,7 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4],
static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -10693,13 +10693,13 @@ static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -12394,7 +12394,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
const int offset_minus_4 = 4 - offset_mod_4; const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
#pragma unroll #pragma unroll
for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]);
@ -16759,13 +16759,13 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif #endif
@ -21128,7 +21128,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset) static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -25485,13 +25485,13 @@ static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset)
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -32287,7 +32287,7 @@ static void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4])
out1[1] = __byte_perm_S (in[0], 0, 0x3727); out1[1] = __byte_perm_S (in[0], 0, 0x3727);
out1[0] = __byte_perm_S (in[0], 0, 0x1707); out1[0] = __byte_perm_S (in[0], 0, 0x1707);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm_S (in[3], 0, 0x03070207); out2[3] = __byte_perm_S (in[3], 0, 0x03070207);
out2[2] = __byte_perm_S (in[3], 0, 0x01070007); out2[2] = __byte_perm_S (in[3], 0, 0x01070007);
@ -32325,7 +32325,7 @@ static void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4])
out1[1] = __byte_perm_S (in[0], 0, 0x7372); out1[1] = __byte_perm_S (in[0], 0, 0x7372);
out1[0] = __byte_perm_S (in[0], 0, 0x7170); out1[0] = __byte_perm_S (in[0], 0, 0x7170);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm_S (in[3], 0, 0x07030702); out2[3] = __byte_perm_S (in[3], 0, 0x07030702);
out2[2] = __byte_perm_S (in[3], 0, 0x07010700); out2[2] = __byte_perm_S (in[3], 0, 0x07010700);
@ -32359,7 +32359,7 @@ static void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4])
out[2] = __byte_perm_S (in2[0], in2[1], 0x4602); out[2] = __byte_perm_S (in2[0], in2[1], 0x4602);
out[3] = __byte_perm_S (in2[2], in2[3], 0x4602); out[3] = __byte_perm_S (in2[2], in2[3], 0x4602);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm_S (in1[0], in1[1], 0x04060002); out[0] = __byte_perm_S (in1[0], in1[1], 0x04060002);
out[1] = __byte_perm_S (in1[2], in1[3], 0x04060002); out[1] = __byte_perm_S (in1[2], in1[3], 0x04060002);
@ -32389,7 +32389,7 @@ static void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4])
out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
#elif defined IS_AMD_ROCM #elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm_S (in1[0], in1[1], 0x06040200); out[0] = __byte_perm_S (in1[0], in1[1], 0x06040200);
out[1] = __byte_perm_S (in1[2], in1[3], 0x06040200); out[1] = __byte_perm_S (in1[2], in1[3], 0x06040200);
@ -32416,7 +32416,7 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
const int offset_minus_4 = 4 - offset_mod_4; const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32_S (w0[0]); w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]); w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]); w0[2] = swap32_S (w0[2]);
@ -32775,13 +32775,13 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
w3[3] = swap32_S (w3[3]); w3[3] = swap32_S (w3[3]);
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif #endif
@ -34428,7 +34428,7 @@ static void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4],
static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -34753,13 +34753,13 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -35090,7 +35090,7 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -35551,13 +35551,13 @@ static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4],
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -36028,7 +36028,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
const int offset_minus_4 = 4 - offset_mod_4; const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32_S (w0[0]); w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]); w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]); w0[2] = swap32_S (w0[2]);
@ -37251,13 +37251,13 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
w7[3] = swap32_S (w7[3]); w7[3] = swap32_S (w7[3]);
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif #endif
@ -37828,7 +37828,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -38985,13 +38985,13 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -40154,7 +40154,7 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u
static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -41839,13 +41839,13 @@ static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif
@ -43540,7 +43540,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
const int offset_minus_4 = 4 - offset_mod_4; const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
#pragma unroll #pragma unroll
for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]);
@ -47905,13 +47905,13 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif #endif
@ -52274,7 +52274,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset)
{ {
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4) switch (offset / 4)
{ {
case 0: case 0:
@ -56631,13 +56631,13 @@ static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset)
} }
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV #if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8); const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif #endif

View File

@ -767,7 +767,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
u32 s6 = 0; u32 s6 = 0;
u32 s7 = 0; u32 s7 = 0;
#if defined IS_AMD_LEGACY || defined IS_GENERIC #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
const u32 src_r00 = swap32_S (src_r0[0]); const u32 src_r00 = swap32_S (src_r0[0]);
const u32 src_r01 = swap32_S (src_r0[1]); const u32 src_r01 = swap32_S (src_r0[1]);
const u32 src_r02 = swap32_S (src_r0[2]); const u32 src_r02 = swap32_S (src_r0[2]);
@ -879,7 +879,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
s7 = swap32_S (s7); s7 = swap32_S (s7);
#endif #endif
#if defined IS_AMD_ROCM || defined IS_NV #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
const int offset_mod_4 = offset & 3; const int offset_mod_4 = offset & 3;
@ -889,7 +889,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif #endif
#if defined IS_AMD_ROCM #if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif #endif

View File

@ -174,27 +174,19 @@ static u64x hl32_to_64 (const u32x a, const u32x b)
} }
#ifdef IS_AMD #ifdef IS_AMD
#if AMD_GCN >= 3
static u32 swap32_S (const u32 v) static u32 swap32_S (const u32 v)
{ {
#ifdef IS_AMD_ROCM
u32 r; u32 r;
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203)); __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203));
return r; return r;
#else
return as_uint (as_uchar4 (v).s3210);
#endif
} }
static u64 swap64_S (const u64 v) static u64 swap64_S (const u64 v)
{ {
#ifdef IS_AMD_ROCM
const u32 v0 = h32_from_64_S (v); const u32 v0 = h32_from_64_S (v);
const u32 v1 = l32_from_64_S (v); const u32 v1 = l32_from_64_S (v);
@ -207,13 +199,18 @@ static u64 swap64_S (const u64 v)
const u64 r = hl32_to_64_S (t1, t0); const u64 r = hl32_to_64_S (t1, t0);
return r; return r;
#else
return (as_ulong (as_uchar8 (v).s76543210));
#endif
} }
#else
static u32 swap32_S (const u32 v)
{
  // Byte-swap a 32-bit scalar: view the word as four bytes, pick them in
  // reverse order (.s3210) and reinterpret the result as a 32-bit word.
  const u32 r = as_uint (as_uchar4 (v).s3210);

  return r;
}
static u64 swap64_S (const u64 v)
{
  // Byte-swap a 64-bit scalar: view the word as eight bytes, pick them in
  // reverse order (.s76543210) and reinterpret the result as a 64-bit word.
  const u64 r = as_ulong (as_uchar8 (v).s76543210);

  return r;
}
#endif
static u32 rotr32_S (const u32 a, const u32 n) static u32 rotr32_S (const u32 a, const u32 n)
{ {
@ -243,57 +240,14 @@ static u64 rotl64_S (const u64 a, const u32 n)
return rotr64_S (a, 64 - n); return rotr64_S (a, 64 - n);
} }
#if AMD_GCN >= 3
static u32x swap32 (const u32x v) static u32x swap32 (const u32x v)
{ {
#ifdef IS_AMD_ROCM
u32x r;
#if VECT_SIZE == 1
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203));
#endif
#if VECT_SIZE >= 2
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s0) : "v"(v.s0), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s1) : "v"(v.s1), "v"(0x00010203));
#endif
#if VECT_SIZE >= 4
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s2) : "v"(v.s2), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s3) : "v"(v.s3), "v"(0x00010203));
#endif
#if VECT_SIZE >= 8
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s4) : "v"(v.s4), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s5) : "v"(v.s5), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s6) : "v"(v.s6), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s7) : "v"(v.s7), "v"(0x00010203));
#endif
#if VECT_SIZE >= 16
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s8) : "v"(v.s8), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s9) : "v"(v.s9), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sa) : "v"(v.sa), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sb) : "v"(v.sb), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sc) : "v"(v.sc), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sd) : "v"(v.sd), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.se) : "v"(v.se), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sf) : "v"(v.sf), "v"(0x00010203));
#endif
return r;
#else
return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu); return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
#endif
} }
static u64x swap64 (const u64x v) static u64x swap64 (const u64x v)
{ {
#ifdef IS_AMD_ROCM
const u32x a0 = h32_from_64 (v); const u32x a0 = h32_from_64 (v);
const u32x a1 = l32_from_64 (v); const u32x a1 = l32_from_64 (v);
@ -352,16 +306,22 @@ static u64x swap64 (const u64x v)
const u64x r = hl32_to_64 (t1, t0); const u64x r = hl32_to_64 (t1, t0);
return r; return r;
}
#else
static u32x swap32 (const u32x v)
{
  // Byte-swap each 32-bit lane using two rotations and one bit-select:
  // the mask 0x00ff00ff takes bytes 0 and 2 from the rotate-by-8 value and
  // the remaining bytes 1 and 3 from the rotate-by-24 value.
  const u32x rot24 = rotate (v, 24u);
  const u32x rot8  = rotate (v, 8u);

  return bitselect (rot24, rot8, 0x00ff00ffu);
}
#else static u64x swap64 (const u64x v)
{
return bitselect (bitselect (rotate (v, 24ul), return bitselect (bitselect (rotate (v, 24ul),
rotate (v, 8ul), 0x000000ff000000fful), rotate (v, 8ul), 0x000000ff000000fful),
bitselect (rotate (v, 56ul), bitselect (rotate (v, 56ul),
rotate (v, 40ul), 0x00ff000000ff0000ul), rotate (v, 40ul), 0x00ff000000ff0000ul),
0xffff0000ffff0000ul); 0xffff0000ffff0000ul);
#endif
} }
#endif
static u32x rotr32 (const u32x a, const u32 n) static u32x rotr32 (const u32x a, const u32 n)
{ {
@ -406,7 +366,7 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
return amd_bytealign (a, b, c); return amd_bytealign (a, b, c);
} }
#ifdef IS_AMD_ROCM #if AMD_GCN >= 3
static u32x __byte_perm (const u32x a, const u32x b, const u32x c) static u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{ {
u32x r; u32x r;
@ -459,9 +419,7 @@ static u32x __byte_perm (const u32x a, const u32x b, const u32x c)
return r; return r;
} }
#endif
#ifdef IS_AMD_ROCM
static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{ {
u32 r; u32 r;
@ -472,7 +430,7 @@ static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
} }
#endif #endif
#ifdef IS_AMD_ROCM_VEGA #if AMD_GCN >= 5
static u32x __add3 (const u32x a, const u32x b, const u32x c) static u32x __add3 (const u32x a, const u32x b, const u32x c)
{ {
u32x r; u32x r;
@ -525,14 +483,7 @@ static u32x __add3 (const u32x a, const u32x b, const u32x c)
return r; return r;
} }
#else
static u32x __add3 (const u32x a, const u32x b, const u32x c)
{
  // Generic three-operand add: plain left-to-right addition of the inputs.
  const u32x ab = a + b;

  return ab + c;
}
#endif
#ifdef IS_AMD_ROCM_VEGA
static u32 __add3_S (const u32 a, const u32 b, const u32 c) static u32 __add3_S (const u32 a, const u32 b, const u32 c)
{ {
u32 r; u32 r;
@ -542,6 +493,11 @@ static u32 __add3_S (const u32 a, const u32 b, const u32 c)
return r; return r;
} }
#else #else
static u32x __add3 (const u32x a, const u32x b, const u32x c)
{
  // Generic three-operand add: plain left-to-right addition of the inputs.
  const u32x ab = a + b;

  return ab + c;
}
static u32 __add3_S (const u32 a, const u32 b, const u32 c) static u32 __add3_S (const u32 a, const u32 b, const u32 c)
{ {
return a + b + c; return a + b + c;

View File

@ -40,12 +40,20 @@
#if VENDOR_ID == (1 << 0) #if VENDOR_ID == (1 << 0)
#if AMD_ROCM == 0 #if AMD_ROCM == 0
#define IS_AMD #define IS_AMD
#define IS_AMD_LEGACY #define AMD_GCN 0
#else #else
#define IS_AMD #define IS_AMD
#define IS_AMD_ROCM #if defined __gfx600__ || defined __gfx601__
#if defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__ #define AMD_GCN 1
#define IS_AMD_ROCM_VEGA #elif defined __gfx700__ || defined __gfx701__ || defined __gfx702__ || defined __gfx703__
#define AMD_GCN 2
#elif defined __gfx800__ || defined __gfx801__ || defined __gfx802__ || defined __gfx803__ || defined __gfx804__ || defined __gfx810__
#define AMD_GCN 3
// One value for the whole gfx8xx branch. A second "#define AMD_GCN 4" here would be an incompatible macro redefinition (diagnosed by the preprocessor) and would not select different code: users of this macro gate on AMD_GCN >= 3 and AMD_GCN >= 5. NOTE(review): if GCN 4 ever needs its own code path, split the gfx8xx condition into a separate #elif instead.
#elif defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__
#define AMD_GCN 5
#else
#define AMD_GCN 0
#endif #endif
#endif #endif
#elif VENDOR_ID == (1 << 1) #elif VENDOR_ID == (1 << 1)