diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 715f170d9..41b784ada 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -225,7 +225,7 @@ static void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) out1[1] = __byte_perm (in[0], 0, 0x3727); out1[0] = __byte_perm (in[0], 0, 0x1707); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x03070207); out2[2] = __byte_perm (in[3], 0, 0x01070007); @@ -263,7 +263,7 @@ static void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) out1[1] = __byte_perm (in[0], 0, 0x1707); out1[0] = __byte_perm (in[0], 0, 0x3727); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x01070007); out2[2] = __byte_perm (in[3], 0, 0x03070207); @@ -301,7 +301,7 @@ static void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) out1[1] = __byte_perm (in[0], 0, 0x7372); out1[0] = __byte_perm (in[0], 0, 0x7170); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x07030702); out2[2] = __byte_perm (in[3], 0, 0x07010700); @@ -339,7 +339,7 @@ static void make_utf16leN (const u32x in[4], u32x out1[4], u32x out2[4]) out1[1] = __byte_perm (in[0], 0, 0x7170); out1[0] = __byte_perm (in[0], 0, 0x7372); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x07010700); out2[2] = __byte_perm (in[3], 0, 0x07030702); @@ -373,7 +373,7 @@ static void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) out[2] = __byte_perm (in2[0], in2[1], 0x4602); out[3] = __byte_perm (in2[2], in2[3], 0x4602); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm (in1[0], in1[1], 0x04060002); out[1] = __byte_perm (in1[2], in1[3], 0x04060002); @@ -403,7 +403,7 @@ static void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) out[2] = __byte_perm (in2[0], in2[1], 0x6420); out[3] = 
__byte_perm (in2[2], in2[3], 0x6420); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm (in1[0], in1[1], 0x06040200); out[1] = __byte_perm (in1[2], in1[3], 0x06040200); @@ -1266,7 +1266,7 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC w0[0] = swap32 (w0[0]); w0[1] = swap32 (w0[1]); w0[2] = swap32 (w0[2]); @@ -1625,13 +1625,13 @@ static void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[3] = swap32 (w3[3]); #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif @@ -3279,7 +3279,7 @@ static void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4] static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { @@ -3606,13 +3606,13 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -3944,7 +3944,7 @@ static void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const 
u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -4405,13 +4405,13 @@ static void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4] } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -4882,7 +4882,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC w0[0] = swap32 (w0[0]); w0[1] = swap32 (w0[1]); w0[2] = swap32 (w0[2]); @@ -6105,13 +6105,13 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], w7[3] = swap32 (w7[3]); #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif @@ -6682,7 +6682,7 @@ static void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -7839,13 +7839,13 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int 
selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -9008,7 +9008,7 @@ static void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -10693,13 +10693,13 @@ static void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -12394,7 +12394,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset) const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC #pragma unroll for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); @@ -16759,13 +16759,13 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset) #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif @@ -21128,7 +21128,7 @@ static void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset) static void switch_buffer_by_offset_1x64_be (u32x 
w[64], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -25485,13 +25485,13 @@ static void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset) } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -32287,7 +32287,7 @@ static void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) out1[1] = __byte_perm_S (in[0], 0, 0x3727); out1[0] = __byte_perm_S (in[0], 0, 0x1707); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm_S (in[3], 0, 0x03070207); out2[2] = __byte_perm_S (in[3], 0, 0x01070007); @@ -32325,7 +32325,7 @@ static void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) out1[1] = __byte_perm_S (in[0], 0, 0x7372); out1[0] = __byte_perm_S (in[0], 0, 0x7170); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm_S (in[3], 0, 0x07030702); out2[2] = __byte_perm_S (in[3], 0, 0x07010700); @@ -32359,7 +32359,7 @@ static void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) out[2] = __byte_perm_S (in2[0], in2[1], 0x4602); out[3] = __byte_perm_S (in2[2], in2[3], 0x4602); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm_S (in1[0], in1[1], 0x04060002); out[1] = __byte_perm_S (in1[2], in1[3], 0x04060002); @@ -32389,7 +32389,7 @@ static void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); - #elif defined IS_AMD_ROCM + #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm_S (in1[0], in1[1], 0x06040200); out[1] = __byte_perm_S (in1[2], 
in1[3], 0x06040200); @@ -32416,7 +32416,7 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); @@ -32775,13 +32775,13 @@ static void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w w3[3] = swap32_S (w3[3]); #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif @@ -34428,7 +34428,7 @@ static void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -34753,13 +34753,13 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -35090,7 +35090,7 @@ static void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -35551,13 
+35551,13 @@ static void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -36028,7 +36028,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); @@ -37251,13 +37251,13 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u w7[3] = swap32_S (w7[3]); #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif @@ -37828,7 +37828,7 @@ static void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -38985,13 +38985,13 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> 
((offset & 3) * 8); #endif @@ -40154,7 +40154,7 @@ static void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { case 0: @@ -41839,13 +41839,13 @@ static void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2 } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif @@ -43540,7 +43540,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) const int offset_minus_4 = 4 - offset_mod_4; - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC #pragma unroll for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); @@ -47905,13 +47905,13 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif @@ -52274,7 +52274,7 @@ static void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) { - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset / 4) { 
case 0: @@ -56631,13 +56631,13 @@ static void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) } #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif diff --git a/OpenCL/inc_rp_optimized.cl b/OpenCL/inc_rp_optimized.cl index 808795cac..0cb0831e7 100644 --- a/OpenCL/inc_rp_optimized.cl +++ b/OpenCL/inc_rp_optimized.cl @@ -767,7 +767,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 u32 s6 = 0; u32 s7 = 0; - #if defined IS_AMD_LEGACY || defined IS_GENERIC + #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC const u32 src_r00 = swap32_S (src_r0[0]); const u32 src_r01 = swap32_S (src_r0[1]); const u32 src_r02 = swap32_S (src_r0[2]); @@ -879,7 +879,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 s7 = swap32_S (s7); #endif - #if defined IS_AMD_ROCM || defined IS_NV + #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV const int offset_mod_4 = offset & 3; @@ -889,7 +889,7 @@ static void append_block8 (const u32 offset, u32 buf0[4], u32 buf1[4], const u32 const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif - #if defined IS_AMD_ROCM + #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif diff --git a/OpenCL/inc_types.cl b/OpenCL/inc_types.cl index ba4a53c66..ee5cf31b6 100644 --- a/OpenCL/inc_types.cl +++ b/OpenCL/inc_types.cl @@ -174,27 +174,19 @@ static u64x hl32_to_64 (const u32x a, const u32x b) } #ifdef IS_AMD + +#if AMD_GCN >= 3 static u32 swap32_S (const u32 v) { - #ifdef IS_AMD_ROCM - u32 r; __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203)); return r; - - #else - - return as_uint (as_uchar4 (v).s3210); - - #endif } static 
u64 swap64_S (const u64 v) { - #ifdef IS_AMD_ROCM - const u32 v0 = h32_from_64_S (v); const u32 v1 = l32_from_64_S (v); @@ -207,13 +199,18 @@ static u64 swap64_S (const u64 v) const u64 r = hl32_to_64_S (t1, t0); return r; - - #else - - return (as_ulong (as_uchar8 (v).s76543210)); - - #endif } +#else +static u32 swap32_S (const u32 v) +{ + return as_uint (as_uchar4 (v).s3210); +} + +static u64 swap64_S (const u64 v) +{ + return (as_ulong (as_uchar8 (v).s76543210)); +} +#endif static u32 rotr32_S (const u32 a, const u32 n) { @@ -243,57 +240,14 @@ static u64 rotl64_S (const u64 a, const u32 n) return rotr64_S (a, 64 - n); } +#if AMD_GCN >= 3 static u32x swap32 (const u32x v) { - #ifdef IS_AMD_ROCM - - u32x r; - - #if VECT_SIZE == 1 - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r) : "v"(v), "v"(0x00010203)); - #endif - - #if VECT_SIZE >= 2 - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s0) : "v"(v.s0), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s1) : "v"(v.s1), "v"(0x00010203)); - #endif - - #if VECT_SIZE >= 4 - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s2) : "v"(v.s2), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s3) : "v"(v.s3), "v"(0x00010203)); - #endif - - #if VECT_SIZE >= 8 - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s4) : "v"(v.s4), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s5) : "v"(v.s5), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s6) : "v"(v.s6), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s7) : "v"(v.s7), "v"(0x00010203)); - #endif - - #if VECT_SIZE >= 16 - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s8) : "v"(v.s8), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.s9) : "v"(v.s9), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sa) : "v"(v.sa), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 
%0, 0, %1, %2;" : "=v"(r.sb) : "v"(v.sb), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sc) : "v"(v.sc), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sd) : "v"(v.sd), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.se) : "v"(v.se), "v"(0x00010203)); - __asm__ volatile ("V_PERM_B32 %0, 0, %1, %2;" : "=v"(r.sf) : "v"(v.sf), "v"(0x00010203)); - #endif - - return r; - - #else - return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu); - - #endif } static u64x swap64 (const u64x v) { - #ifdef IS_AMD_ROCM - const u32x a0 = h32_from_64 (v); const u32x a1 = l32_from_64 (v); @@ -352,16 +306,22 @@ static u64x swap64 (const u64x v) const u64x r = hl32_to_64 (t1, t0); return r; +} +#else +static u32x swap32 (const u32x v) +{ + return bitselect (rotate (v, 24u), rotate (v, 8u), 0x00ff00ffu); +} - #else - +static u64x swap64 (const u64x v) +{ return bitselect (bitselect (rotate (v, 24ul), rotate (v, 8ul), 0x000000ff000000fful), bitselect (rotate (v, 56ul), rotate (v, 40ul), 0x00ff000000ff0000ul), 0xffff0000ffff0000ul); - #endif } +#endif static u32x rotr32 (const u32x a, const u32 n) { @@ -406,7 +366,7 @@ static u32 amd_bytealign_S (const u32 a, const u32 b, const u32 c) return amd_bytealign (a, b, c); } -#ifdef IS_AMD_ROCM +#if AMD_GCN >= 3 static u32x __byte_perm (const u32x a, const u32x b, const u32x c) { u32x r; @@ -459,9 +419,7 @@ static u32x __byte_perm (const u32x a, const u32x b, const u32x c) return r; } -#endif -#ifdef IS_AMD_ROCM static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -472,7 +430,7 @@ static u32 __byte_perm_S (const u32 a, const u32 b, const u32 c) } #endif -#ifdef IS_AMD_ROCM_VEGA +#if AMD_GCN >= 5 static u32x __add3 (const u32x a, const u32x b, const u32x c) { u32x r; @@ -525,14 +483,7 @@ static u32x __add3 (const u32x a, const u32x b, const u32x c) return r; } -#else -static u32x __add3 (const u32x a, const u32x b, const u32x c) -{ - 
return a + b + c; -} -#endif - -#ifdef IS_AMD_ROCM_VEGA static u32 __add3_S (const u32 a, const u32 b, const u32 c) { u32 r; @@ -542,6 +493,11 @@ static u32 __add3_S (const u32 a, const u32 b, const u32 c) return r; } #else +static u32x __add3 (const u32x a, const u32x b, const u32x c) +{ + return a + b + c; +} + static u32 __add3_S (const u32 a, const u32 b, const u32 c) { return a + b + c; diff --git a/OpenCL/inc_vendor.cl b/OpenCL/inc_vendor.cl index a580740e1..792fd0411 100644 --- a/OpenCL/inc_vendor.cl +++ b/OpenCL/inc_vendor.cl @@ -40,12 +40,20 @@ #if VENDOR_ID == (1 << 0) #if AMD_ROCM == 0 #define IS_AMD -#define IS_AMD_LEGACY +#define AMD_GCN 0 #else #define IS_AMD -#define IS_AMD_ROCM -#if defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__ -#define IS_AMD_ROCM_VEGA +#if defined __gfx600__ || defined __gfx601__ +#define AMD_GCN 1 +#elif defined __gfx700__ || defined __gfx701__ || defined __gfx702__ || defined __gfx703__ +#define AMD_GCN 2 +#elif defined __gfx800__ || defined __gfx801__ || defined __gfx802__ || defined __gfx803__ || defined __gfx804__ || defined __gfx810__ +#define AMD_GCN 3 +/* GCN 4 shares the gfx8xx targets with GCN 3, so it is not defined separately */ +#elif defined __gfx900__ || defined __gfx901__ || defined __gfx902__ || defined __gfx903__ +#define AMD_GCN 5 +#else +#define AMD_GCN 0 #endif #endif #elif VENDOR_ID == (1 << 1)