Add fine-tuned AMD GCN control macros

pull/1358/head
jsteube 7 years ago
parent 3b89153c2d
commit ac9f1da747

@ -225,7 +225,7 @@ static void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x3727);
out1[0] = __byte_perm (in[0], 0, 0x1707);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x03070207);
out2[2] = __byte_perm (in[3], 0, 0x01070007);
@ -263,7 +263,7 @@ static void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x1707);
out1[0] = __byte_perm (in[0], 0, 0x3727);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x01070007);
out2[2] = __byte_perm (in[3], 0, 0x03070207);
@ -301,7 +301,7 @@ static void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x7372);
out1[0] = __byte_perm (in[0], 0, 0x7170);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x07030702);
out2[2] = __byte_perm (in[3], 0, 0x07010700);
@ -339,7 +339,7 @@ static void make_utf16leN (const u32x in[4], u32x out1[4], u32x out2[4])
out1[1] = __byte_perm (in[0], 0, 0x7170);
out1[0] = __byte_perm (in[0], 0, 0x7372);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm (in[3], 0, 0x07010700);
out2[2] = __byte_perm (in[3], 0, 0x07030702);
@ -373,7 +373,7 @@ static void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4])
out[2] = __byte_perm (in2[0], in2[1], 0x4602);
out[3] = __byte_perm (in2[2], in2[3], 0x4602);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm (in1[0], in1[1], 0x04060002);
out[1] = __byte_perm (in1[2], in1[3], 0x04060002);
@ -403,7 +403,7 @@ static void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4])
out[2] = __byte_perm (in2[0], in2[1], 0x6420);
out[3] = __byte_perm (in2[2], in2[3], 0x6420);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm (in1[0], in1[1], 0x06040200);
out[1] = __byte_perm (in1[2], in1[3], 0x06040200);
@ -1266,7 +1266,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32 (w0[0]);
w0[1] = swap32 (w0[1]);
w0[2] = swap32 (w0[2]);
@ -1625,13 +1625,13 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
w3[3] = swap32 (w3[3]);
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
@ -3279,7 +3279,7 @@ static void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4]
static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
@ -3606,13 +3606,13 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -3944,7 +3944,7 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x
static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -4405,13 +4405,13 @@ static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4]
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -4882,7 +4882,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32 (w0[0]);
w0[1] = swap32 (w0[1]);
w0[2] = swap32 (w0[2]);
@ -6105,13 +6105,13 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
w7[3] = swap32 (w7[3]);
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
@ -6682,7 +6682,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -7839,13 +7839,13 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4],
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -9008,7 +9008,7 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4],
static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -10693,13 +10693,13 @@ static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -12394,7 +12394,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
#pragma unroll
for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]);
@ -16759,13 +16759,13 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
@ -21128,7 +21128,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -25485,13 +25485,13 @@ static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset)
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -32287,7 +32287,7 @@ static void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4])
out1[1] = __byte_perm_S (in[0], 0, 0x3727);
out1[0] = __byte_perm_S (in[0], 0, 0x1707);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm_S (in[3], 0, 0x03070207);
out2[2] = __byte_perm_S (in[3], 0, 0x01070007);
@ -32325,7 +32325,7 @@ static void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4])
out1[1] = __byte_perm_S (in[0], 0, 0x7372);
out1[0] = __byte_perm_S (in[0], 0, 0x7170);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out2[3] = __byte_perm_S (in[3], 0, 0x07030702);
out2[2] = __byte_perm_S (in[3], 0, 0x07010700);
@ -32359,7 +32359,7 @@ static void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4])
out[2] = __byte_perm_S (in2[0], in2[1], 0x4602);
out[3] = __byte_perm_S (in2[2], in2[3], 0x4602);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm_S (in1[0], in1[1], 0x04060002);
out[1] = __byte_perm_S (in1[2], in1[3], 0x04060002);
@ -32389,7 +32389,7 @@ static void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4])
out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
#elif defined IS_AMD_ROCM
#elif defined IS_AMD && AMD_GCN >= 3
out[0] = __byte_perm_S (in1[0], in1[1], 0x06040200);
out[1] = __byte_perm_S (in1[2], in1[3], 0x06040200);
@ -32416,7 +32416,7 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
@ -32775,13 +32775,13 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
w3[3] = swap32_S (w3[3]);
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
@ -34428,7 +34428,7 @@ static void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4],
static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -34753,13 +34753,13 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -35090,7 +35090,7 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -35551,13 +35551,13 @@ static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4],
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -36028,7 +36028,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
w0[0] = swap32_S (w0[0]);
w0[1] = swap32_S (w0[1]);
w0[2] = swap32_S (w0[2]);
@ -37251,13 +37251,13 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
w7[3] = swap32_S (w7[3]);
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
@ -37828,7 +37828,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -38985,13 +38985,13 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -40154,7 +40154,7 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u
static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -41839,13 +41839,13 @@ static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif
@ -43540,7 +43540,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
const int offset_minus_4 = 4 - offset_mod_4;
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
#pragma unroll
for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]);
@ -47905,13 +47905,13 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif
@ -52274,7 +52274,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset)
{
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
switch (offset / 4)
{
case 0:
@ -56631,13 +56631,13 @@ static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset)
}
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
#if defined IS_NV
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
#endif

@ -767,7 +767,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
u32 s6 = 0;
u32 s7 = 0;
#if defined IS_AMD_LEGACY || defined IS_GENERIC
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
const u32 src_r00 = swap32_S (src_r0[0]);
const u32 src_r01 = swap32_S (src_r0[1]);
const u32 src_r02 = swap32_S (src_r0[2]);
@ -879,7 +879,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
s7 = swap32_S (s7);
#endif
#if defined IS_AMD_ROCM || defined IS_NV
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
const int offset_mod_4 = offset & 3;
@ -889,7 +889,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
#endif
#if defined IS_AMD_ROCM
#if defined IS_AMD
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
#endif

@ -174,27 +174,19 @@ static u64x hl32_to_64 (const u32x a, const u32x b)
}
#ifdef IS_AMD
#if AMD_GCN >= 3
// Scalar 32-bit byte swap (endianness reversal) of v.
static u32 swap32_S (const u32 v)
{
#ifdef IS_AMD_ROCM
// ROCm/GCN path: V_PERM_B32 with selector 0x00010203 permutes the source
// bytes so the result matches the as_uchar4(v).s3210 fallback below,
// i.e. a full byte reversal in a single instruction.
// NOTE(review): selector semantics per GCN ISA — confirm against the
// V_PERM_B32 description for the targeted GCN generation.
u32 r;
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203));
return r;
#else
// Portable OpenCL fallback: reinterpret as four bytes, reverse their
// order, and reinterpret back as a u32.
return as_uint (as_uchar4 (v).s3210);
#endif
}
static u64 swap64_S (const u64 v)
{
#ifdef IS_AMD_ROCM
const u32 v0 = h32_from_64_S (v);
const u32 v1 = l32_from_64_S (v);
@ -207,13 +199,18 @@ static u64 swap64_S (const u64 v)
const u64 r = hl32_to_64_S (t1, t0);
return r;
}
#else
// Reverse the byte order of a 32-bit word (scalar variant).
static u32 swap32_S (const u32 v)
{
// View the word as four individual bytes, then reassemble them in
// the opposite order: byte 3 becomes byte 0, and so on.
const uchar4 bytes = as_uchar4 (v);

return as_uint (bytes.s3210);
}
#else
static u64 swap64_S (const u64 v)
{
return (as_ulong (as_uchar8 (v).s76543210));
#endif
}
#endif
static u32 rotr32_S (const u32 a, const u32 n)
{
@ -243,57 +240,14 @@ static u64 rotl64_S (const u64 a, const u32 n)
return rotr64_S (a, 64 - n);
}
#if AMD_GCN >= 3
// Vectorized 32-bit byte swap: reverses the byte order of every lane of v.
static u32x swap32 (const u32x v)
{
#ifdef IS_AMD_ROCM
// ROCm/GCN path: one V_PERM_B32 per vector lane. Selector 0x00010203
// reorders the four source bytes into reversed order (matching the
// bitselect/rotate fallback below). Inline asm cannot operate on the
// whole OpenCL vector at once, so each component is swapped separately,
// gated by the compile-time VECT_SIZE.
u32x r;
#if VECT_SIZE == 1
// Scalar build: u32x is a plain u32.
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203));
#endif
#if VECT_SIZE >= 2
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s0) : "v"(v.s0), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s1) : "v"(v.s1), "v"(0x00010203));
#endif
#if VECT_SIZE >= 4
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s2) : "v"(v.s2), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s3) : "v"(v.s3), "v"(0x00010203));
#endif
#if VECT_SIZE >= 8
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s4) : "v"(v.s4), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s5) : "v"(v.s5), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s6) : "v"(v.s6), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s7) : "v"(v.s7), "v"(0x00010203));
#endif
#if VECT_SIZE >= 16
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s8) : "v"(v.s8), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s9) : "v"(v.s9), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sa) : "v"(v.sa), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sb) : "v"(v.sb), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sc) : "v"(v.sc), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sd) : "v"(v.sd), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.se) : "v"(v.se), "v"(0x00010203));
__asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sf) : "v"(v.sf), "v"(0x00010203));
#endif
return r;
#else
// Portable fallback: classic bswap idiom. Where mask 0x00ff00ff is set
// (bytes 0 and 2) take bytes from rotate-left-8, elsewhere (bytes 1 and
// 3) from rotate-left-24; the combination yields the reversed byte order.
return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu);
#endif
}
static u64x swap64 (const u64x v)
{
#ifdef IS_AMD_ROCM
const u32x a0 = h32_from_64 (v);
const u32x a1 = l32_from_64 (v);
@ -352,16 +306,22 @@ static u64x swap64 (const u64x v)
const u64x r = hl32_to_64 (t1, t0);
return r;
}
#else
// Reverse the byte order of every lane of a 32-bit vector.
static u32x swap32 (const u32x v)
{
// Standard two-rotate bswap: for input bytes [b3 b2 b1 b0],
//   rotate left 24 -> [b0 b3 b2 b1]  (supplies result bytes 3 and 1)
//   rotate left  8 -> [b2 b1 b0 b3]  (supplies result bytes 2 and 0)
// bitselect takes bits from the second operand where the mask is set,
// so mask 0x00ff00ff merges them into [b0 b1 b2 b3].
const u32x rol24 = rotate (v, 24u);
const u32x rol8  = rotate (v,  8u);

return bitselect (rol24, rol8, 0x00ff00ffu);
}
#else
static u64x swap64 (const u64x v)
{
return bitselect (bitselect (rotate (v, 24ul),
rotate (v, 8ul), 0x000000ff000000fful),
bitselect (rotate (v, 56ul),
rotate (v, 40ul), 0x00ff000000ff0000ul),
0xffff0000ffff0000ul);
#endif
}
#endif
static u32x rotr32 (const u32x a, const u32 n)
{
@ -406,7 +366,7 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c)
return amd_bytealign (a, b, c);
}
#ifdef IS_AMD_ROCM
#if AMD_GCN >= 3
static u32x __byte_perm (const u32x a, const u32x b, const u32x c)
{
u32x r;
@ -459,9 +419,7 @@ static u32x __byte_perm (const u32x a, const u32x b, const u32x c)
return r;
}
#endif
#ifdef IS_AMD_ROCM
static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -472,7 +430,7 @@ static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c)
}
#endif
#ifdef IS_AMD_ROCM_VEGA
#if AMD_GCN >= 5
static u32x __add3 (const u32x a, const u32x b, const u32x c)
{
u32x r;
@ -525,14 +483,7 @@ static u32x __add3 (const u32x a, const u32x b, const u32x c)
return r;
}
#else
// Generic three-operand add fallback: simply sums a, b and c.
static u32x __add3 (const u32x a, const u32x b, const u32x c)
{
const u32x partial = a + b;

return partial + c;
}
#endif
#ifdef IS_AMD_ROCM_VEGA
static u32 __add3_S (const u32 a, const u32 b, const u32 c)
{
u32 r;
@ -542,6 +493,11 @@ static u32 __add3_S (const u32 a, const u32 b, const u32 c)
return r;
}
#else
static u32x __add3 (const u32x a, const u32x b, const u32x c)
{
return a + b + c;
}
static u32 __add3_S (const u32 a, const u32 b, const u32 c)
{
return a + b + c;

@ -40,12 +40,20 @@
#if VENDOR_ID == (1 << 0)
#if AMD_ROCM == 0
#define IS_AMD
#define IS_AMD_LEGACY
#define AMD_GCN 0
#else
#define IS_AMD
#define IS_AMD_ROCM
#if defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__
#define IS_AMD_ROCM_VEGA
#if defined __gfx600__ || defined __gfx601__
#define AMD_GCN 1
#elif defined __gfx700__ || defined __gfx701__ || defined __gfx702__ || defined __gfx703__
#define AMD_GCN 2
#elif defined __gfx800__ || defined __gfx801__ || defined __gfx802__ || defined __gfx803__ || defined __gfx804__ || defined __gfx810__
#define AMD_GCN 3
#define AMD_GCN 4
#elif defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__
#define AMD_GCN 5
#else
#define AMD_GCN 0
#endif
#endif
#elif VENDOR_ID == (1 << 1)

Loading…
Cancel
Save