|
|
|
@ -225,7 +225,7 @@ static void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4])
|
|
|
|
|
out1[1] = __byte_perm (in[0], 0, 0x3727);
|
|
|
|
|
out1[0] = __byte_perm (in[0], 0, 0x1707);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out2[3] = __byte_perm (in[3], 0, 0x03070207);
|
|
|
|
|
out2[2] = __byte_perm (in[3], 0, 0x01070007);
|
|
|
|
@ -263,7 +263,7 @@ static void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4])
|
|
|
|
|
out1[1] = __byte_perm (in[0], 0, 0x1707);
|
|
|
|
|
out1[0] = __byte_perm (in[0], 0, 0x3727);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out2[3] = __byte_perm (in[3], 0, 0x01070007);
|
|
|
|
|
out2[2] = __byte_perm (in[3], 0, 0x03070207);
|
|
|
|
@ -301,7 +301,7 @@ static void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4])
|
|
|
|
|
out1[1] = __byte_perm (in[0], 0, 0x7372);
|
|
|
|
|
out1[0] = __byte_perm (in[0], 0, 0x7170);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out2[3] = __byte_perm (in[3], 0, 0x07030702);
|
|
|
|
|
out2[2] = __byte_perm (in[3], 0, 0x07010700);
|
|
|
|
@ -339,7 +339,7 @@ static void make_utf16leN (const u32x in[4], u32x out1[4], u32x out2[4])
|
|
|
|
|
out1[1] = __byte_perm (in[0], 0, 0x7170);
|
|
|
|
|
out1[0] = __byte_perm (in[0], 0, 0x7372);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out2[3] = __byte_perm (in[3], 0, 0x07010700);
|
|
|
|
|
out2[2] = __byte_perm (in[3], 0, 0x07030702);
|
|
|
|
@ -373,7 +373,7 @@ static void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4])
|
|
|
|
|
out[2] = __byte_perm (in2[0], in2[1], 0x4602);
|
|
|
|
|
out[3] = __byte_perm (in2[2], in2[3], 0x4602);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out[0] = __byte_perm (in1[0], in1[1], 0x04060002);
|
|
|
|
|
out[1] = __byte_perm (in1[2], in1[3], 0x04060002);
|
|
|
|
@ -403,7 +403,7 @@ static void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4])
|
|
|
|
|
out[2] = __byte_perm (in2[0], in2[1], 0x6420);
|
|
|
|
|
out[3] = __byte_perm (in2[2], in2[3], 0x6420);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out[0] = __byte_perm (in1[0], in1[1], 0x06040200);
|
|
|
|
|
out[1] = __byte_perm (in1[2], in1[3], 0x06040200);
|
|
|
|
@ -1266,7 +1266,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
|
|
|
|
|
|
|
|
|
|
const int offset_minus_4 = 4 - offset_mod_4;
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
w0[0] = swap32 (w0[0]);
|
|
|
|
|
w0[1] = swap32 (w0[1]);
|
|
|
|
|
w0[2] = swap32 (w0[2]);
|
|
|
|
@ -1625,13 +1625,13 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x
|
|
|
|
|
w3[3] = swap32 (w3[3]);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -3279,7 +3279,7 @@ static void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4]
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
@ -3606,13 +3606,13 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -3944,7 +3944,7 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -4405,13 +4405,13 @@ static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4]
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -4882,7 +4882,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
|
|
|
|
|
|
|
|
|
|
const int offset_minus_4 = 4 - offset_mod_4;
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
w0[0] = swap32 (w0[0]);
|
|
|
|
|
w0[1] = swap32 (w0[1]);
|
|
|
|
|
w0[2] = swap32 (w0[2]);
|
|
|
|
@ -6105,13 +6105,13 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
|
|
|
|
|
w7[3] = swap32 (w7[3]);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -6682,7 +6682,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4],
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -7839,13 +7839,13 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4],
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -9008,7 +9008,7 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4],
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -10693,13 +10693,13 @@ static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -12394,7 +12394,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
|
|
|
|
|
|
|
|
|
|
const int offset_minus_4 = 4 - offset_mod_4;
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
|
|
|
|
|
#pragma unroll
|
|
|
|
|
for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]);
|
|
|
|
@ -16759,13 +16759,13 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -21128,7 +21128,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset)
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -25485,13 +25485,13 @@ static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset)
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -32287,7 +32287,7 @@ static void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4])
|
|
|
|
|
out1[1] = __byte_perm_S (in[0], 0, 0x3727);
|
|
|
|
|
out1[0] = __byte_perm_S (in[0], 0, 0x1707);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out2[3] = __byte_perm_S (in[3], 0, 0x03070207);
|
|
|
|
|
out2[2] = __byte_perm_S (in[3], 0, 0x01070007);
|
|
|
|
@ -32325,7 +32325,7 @@ static void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4])
|
|
|
|
|
out1[1] = __byte_perm_S (in[0], 0, 0x7372);
|
|
|
|
|
out1[0] = __byte_perm_S (in[0], 0, 0x7170);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out2[3] = __byte_perm_S (in[3], 0, 0x07030702);
|
|
|
|
|
out2[2] = __byte_perm_S (in[3], 0, 0x07010700);
|
|
|
|
@ -32359,7 +32359,7 @@ static void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4])
|
|
|
|
|
out[2] = __byte_perm_S (in2[0], in2[1], 0x4602);
|
|
|
|
|
out[3] = __byte_perm_S (in2[2], in2[3], 0x4602);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out[0] = __byte_perm_S (in1[0], in1[1], 0x04060002);
|
|
|
|
|
out[1] = __byte_perm_S (in1[2], in1[3], 0x04060002);
|
|
|
|
@ -32389,7 +32389,7 @@ static void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4])
|
|
|
|
|
out[2] = __byte_perm_S (in2[0], in2[1], 0x6420);
|
|
|
|
|
out[3] = __byte_perm_S (in2[2], in2[3], 0x6420);
|
|
|
|
|
|
|
|
|
|
#elif defined IS_AMD_ROCM
|
|
|
|
|
#elif defined IS_AMD && AMD_GCN >= 3
|
|
|
|
|
|
|
|
|
|
out[0] = __byte_perm_S (in1[0], in1[1], 0x06040200);
|
|
|
|
|
out[1] = __byte_perm_S (in1[2], in1[3], 0x06040200);
|
|
|
|
@ -32416,7 +32416,7 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
|
|
|
|
|
|
|
|
|
|
const int offset_minus_4 = 4 - offset_mod_4;
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
w0[0] = swap32_S (w0[0]);
|
|
|
|
|
w0[1] = swap32_S (w0[1]);
|
|
|
|
|
w0[2] = swap32_S (w0[2]);
|
|
|
|
@ -32775,13 +32775,13 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
|
|
|
|
|
w3[3] = swap32_S (w3[3]);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -34428,7 +34428,7 @@ static void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4],
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -34753,13 +34753,13 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -35090,7 +35090,7 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -35551,13 +35551,13 @@ static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4],
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -36028,7 +36028,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
|
|
|
|
|
|
|
|
|
|
const int offset_minus_4 = 4 - offset_mod_4;
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
w0[0] = swap32_S (w0[0]);
|
|
|
|
|
w0[1] = swap32_S (w0[1]);
|
|
|
|
|
w0[2] = swap32_S (w0[2]);
|
|
|
|
@ -37251,13 +37251,13 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
|
|
|
|
|
w7[3] = swap32_S (w7[3]);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -37828,7 +37828,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -38985,13 +38985,13 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -40154,7 +40154,7 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -41839,13 +41839,13 @@ static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -43540,7 +43540,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
|
|
|
|
|
|
|
|
|
|
const int offset_minus_4 = 4 - offset_mod_4;
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
|
|
|
|
|
#pragma unroll
|
|
|
|
|
for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]);
|
|
|
|
@ -47905,13 +47905,13 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
|
|
|
|
|
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
@ -52274,7 +52274,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset)
|
|
|
|
|
|
|
|
|
|
static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset)
|
|
|
|
|
{
|
|
|
|
|
#if defined IS_AMD_LEGACY || defined IS_GENERIC
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
|
|
|
|
|
switch (offset / 4)
|
|
|
|
|
{
|
|
|
|
|
case 0:
|
|
|
|
@ -56631,13 +56631,13 @@ static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset)
|
|
|
|
|
}
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM || defined IS_NV
|
|
|
|
|
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV
|
|
|
|
|
|
|
|
|
|
#if defined IS_NV
|
|
|
|
|
const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|
#if defined IS_AMD_ROCM
|
|
|
|
|
#if defined IS_AMD
|
|
|
|
|
const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
|
|
|
|
|
#endif
|
|
|
|
|
|
|
|
|
|