diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index 8e4886ca4..6c163280c 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -9042,9 +9042,10 @@ inline void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { + #if defined IS_AMD || defined IS_GENERIC switch (offset / 4) { - case 0: + case 0: c0[0] = amd_bytealign_S (w3[3], 0, offset); w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9062,9 +9063,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = amd_bytealign_S (w0[1], w0[2], offset); w0[1] = amd_bytealign_S (w0[0], w0[1], offset); w0[0] = amd_bytealign_S ( 0, w0[0], offset); + break; - case 1: + case 1: c0[1] = amd_bytealign_S (w3[3], 0, offset); c0[0] = amd_bytealign_S (w3[2], w3[3], offset); w3[3] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9083,9 +9085,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = amd_bytealign_S (w0[0], w0[1], offset); w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; + break; - case 2: + case 2: c0[2] = amd_bytealign_S (w3[3], 0, offset); c0[1] = amd_bytealign_S (w3[2], w3[3], offset); c0[0] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9105,9 +9108,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; + break; - case 3: + case 3: c0[3] = amd_bytealign_S (w3[3], 0, offset); c0[2] = amd_bytealign_S (w3[2], w3[3], offset); c0[1] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9128,9 +9132,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 4: + case 4: c1[0] = amd_bytealign_S (w3[3], 0, offset); c0[3] = amd_bytealign_S (w3[2], w3[3], offset); c0[2] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9152,9 +9157,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 5: + case 5: c1[1] = amd_bytealign_S (w3[3], 0, offset); c1[0] = amd_bytealign_S (w3[2], w3[3], offset); c0[3] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9177,9 +9183,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 6: + case 6: c1[2] = amd_bytealign_S (w3[3], 0, offset); c1[1] = amd_bytealign_S (w3[2], w3[3], offset); c1[0] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9203,9 +9210,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 7: + case 7: c1[3] = amd_bytealign_S (w3[3], 0, offset); c1[2] = amd_bytealign_S (w3[2], w3[3], offset); c1[1] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9230,9 +9238,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 8: + case 8: c2[0] = amd_bytealign_S (w3[3], 0, offset); c1[3] = amd_bytealign_S (w3[2], w3[3], offset); c1[2] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9258,9 +9267,10 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; - case 9: + case 9: c2[1] = amd_bytealign_S (w3[3], 0, offset); c2[0] = amd_bytealign_S (w3[2], w3[3], offset); c1[3] = amd_bytealign_S (w3[1], w3[2], offset); @@ -9287,6 +9297,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 10: @@ -9317,6 +9328,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 11: @@ -9348,6 +9360,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 12: @@ -9380,6 +9393,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 13: @@ -9413,6 +9427,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 14: @@ -9447,6 +9462,7 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; case 15: @@ -9482,8 +9498,473 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], w0[2] = 0; w0[1] = 0; w0[0] = 0; + break; } + #endif + + #ifdef IS_NV + const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + c0[0] = __byte_perm_S ( 0, w3[3], selector); + w3[3] = __byte_perm_S (w3[3], w3[2], selector); + w3[2] = __byte_perm_S (w3[2], w3[1], selector); + w3[1] = __byte_perm_S (w3[1], w3[0], selector); + w3[0] = __byte_perm_S (w3[0], w2[3], selector); + w2[3] = __byte_perm_S (w2[3], w2[2], selector); + w2[2] = __byte_perm_S (w2[2], w2[1], selector); + w2[1] = __byte_perm_S (w2[1], w2[0], selector); + w2[0] = __byte_perm_S (w2[0], w1[3], selector); + w1[3] = __byte_perm_S (w1[3], w1[2], selector); + w1[2] = __byte_perm_S (w1[2], w1[1], selector); + w1[1] = __byte_perm_S (w1[1], w1[0], selector); + w1[0] = __byte_perm_S (w1[0], w0[3], selector); + w0[3] = __byte_perm_S (w0[3], w0[2], selector); + w0[2] = __byte_perm_S (w0[2], w0[1], selector); + w0[1] = __byte_perm_S (w0[1], w0[0], selector); + w0[0] = __byte_perm_S (w0[0], 0, selector); + + break; + + case 1: + c0[1] = __byte_perm_S ( 0, w3[3], selector); + c0[0] = __byte_perm_S (w3[3], w3[2], selector); + w3[3] = __byte_perm_S (w3[2], w3[1], selector); + w3[2] = __byte_perm_S (w3[1], w3[0], selector); + w3[1] = __byte_perm_S (w3[0], w2[3], selector); + w3[0] = __byte_perm_S (w2[3], w2[2], selector); + w2[3] = __byte_perm_S (w2[2], w2[1], selector); + w2[2] = __byte_perm_S (w2[1], w2[0], selector); + w2[1] = __byte_perm_S (w2[0], w1[3], selector); + w2[0] = __byte_perm_S (w1[3], w1[2], selector); + w1[3] = __byte_perm_S (w1[2], w1[1], selector); + w1[2] = __byte_perm_S (w1[1], w1[0], selector); + w1[1] = __byte_perm_S (w1[0], w0[3], selector); + w1[0] = __byte_perm_S (w0[3], w0[2], selector); + w0[3] = __byte_perm_S (w0[2], w0[1], selector); + w0[2] = __byte_perm_S (w0[1], w0[0], selector); + w0[1] = __byte_perm_S (w0[0], 0, selector); + w0[0] = 0; + + break; + + case 2: + c0[2] = __byte_perm_S ( 0, w3[3], selector); + c0[1] = __byte_perm_S (w3[3], w3[2], selector); + c0[0] = __byte_perm_S (w3[2], w3[1], selector); + w3[3] = __byte_perm_S (w3[1], w3[0], selector); + w3[2] = __byte_perm_S (w3[0], w2[3], selector); + w3[1] = __byte_perm_S (w2[3], w2[2], selector); + w3[0] = __byte_perm_S (w2[2], w2[1], selector); + w2[3] = __byte_perm_S (w2[1], w2[0], selector); + w2[2] = __byte_perm_S (w2[0], w1[3], selector); + w2[1] = __byte_perm_S (w1[3], w1[2], selector); + w2[0] = __byte_perm_S (w1[2], w1[1], selector); + w1[3] = __byte_perm_S (w1[1], w1[0], selector); + w1[2] = __byte_perm_S (w1[0], w0[3], selector); + w1[1] = __byte_perm_S (w0[3], w0[2], selector); + w1[0] = __byte_perm_S (w0[2], w0[1], selector); + w0[3] = __byte_perm_S (w0[1], w0[0], selector); + w0[2] = __byte_perm_S (w0[0], 0, selector); + w0[1] = 0; + w0[0] = 0; + + break; + + case 3: + c0[3] = __byte_perm_S ( 0, w3[3], selector); + c0[2] = __byte_perm_S (w3[3], w3[2], selector); + c0[1] = __byte_perm_S (w3[2], w3[1], selector); + c0[0] = __byte_perm_S (w3[1], w3[0], selector); + w3[3] = __byte_perm_S (w3[0], w2[3], selector); + w3[2] = __byte_perm_S (w2[3], w2[2], selector); + w3[1] = __byte_perm_S (w2[2], w2[1], selector); + w3[0] = __byte_perm_S (w2[1], w2[0], selector); + w2[3] = __byte_perm_S (w2[0], w1[3], selector); + w2[2] = __byte_perm_S (w1[3], w1[2], selector); + w2[1] = __byte_perm_S (w1[2], w1[1], selector); + w2[0] = __byte_perm_S (w1[1], w1[0], selector); + w1[3] = __byte_perm_S (w1[0], w0[3], selector); + w1[2] = __byte_perm_S (w0[3], w0[2], selector); + w1[1] = __byte_perm_S (w0[2], w0[1], selector); + w1[0] = __byte_perm_S (w0[1], w0[0], selector); + w0[3] = __byte_perm_S (w0[0], 0, selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 4: + c1[0] = __byte_perm_S ( 0, w3[3], selector); + c0[3] = __byte_perm_S (w3[3], w3[2], selector); + c0[2] = __byte_perm_S (w3[2], w3[1], selector); + c0[1] = __byte_perm_S (w3[1], w3[0], selector); + c0[0] = __byte_perm_S (w3[0], w2[3], selector); + w3[3] = __byte_perm_S (w2[3], w2[2], selector); + w3[2] = __byte_perm_S (w2[2], w2[1], selector); + w3[1] = __byte_perm_S (w2[1], w2[0], selector); + w3[0] = __byte_perm_S (w2[0], w1[3], selector); + w2[3] = __byte_perm_S (w1[3], w1[2], selector); + w2[2] = __byte_perm_S (w1[2], w1[1], selector); + w2[1] = __byte_perm_S (w1[1], w1[0], selector); + w2[0] = __byte_perm_S (w1[0], w0[3], selector); + w1[3] = __byte_perm_S (w0[3], w0[2], selector); + w1[2] = __byte_perm_S (w0[2], w0[1], selector); + w1[1] = __byte_perm_S (w0[1], w0[0], selector); + w1[0] = __byte_perm_S (w0[0], 0, selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 5: + c1[1] = __byte_perm_S ( 0, w3[3], selector); + c1[0] = __byte_perm_S (w3[3], w3[2], selector); + c0[3] = __byte_perm_S (w3[2], w3[1], selector); + c0[2] = __byte_perm_S (w3[1], w3[0], selector); + c0[1] = __byte_perm_S (w3[0], w2[3], selector); + c0[0] = __byte_perm_S (w2[3], w2[2], selector); + w3[3] = __byte_perm_S (w2[2], w2[1], selector); + w3[2] = __byte_perm_S (w2[1], w2[0], selector); + w3[1] = __byte_perm_S (w2[0], w1[3], selector); + w3[0] = __byte_perm_S (w1[3], w1[2], selector); + w2[3] = __byte_perm_S (w1[2], w1[1], selector); + w2[2] = __byte_perm_S (w1[1], w1[0], selector); + w2[1] = __byte_perm_S (w1[0], w0[3], selector); + w2[0] = __byte_perm_S (w0[3], w0[2], selector); + w1[3] = __byte_perm_S (w0[2], w0[1], selector); + w1[2] = __byte_perm_S (w0[1], w0[0], selector); + w1[1] = __byte_perm_S (w0[0], 0, selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 6: + c1[2] = __byte_perm_S ( 0, w3[3], selector); + c1[1] = __byte_perm_S (w3[3], w3[2], selector); + c1[0] = __byte_perm_S (w3[2], w3[1], selector); + c0[3] = __byte_perm_S (w3[1], w3[0], selector); + c0[2] = __byte_perm_S (w3[0], w2[3], selector); + c0[1] = __byte_perm_S (w2[3], w2[2], selector); + c0[0] = __byte_perm_S (w2[2], w2[1], selector); + w3[3] = __byte_perm_S (w2[1], w2[0], selector); + w3[2] = __byte_perm_S (w2[0], w1[3], selector); + w3[1] = __byte_perm_S (w1[3], w1[2], selector); + w3[0] = __byte_perm_S (w1[2], w1[1], selector); + w2[3] = __byte_perm_S (w1[1], w1[0], selector); + w2[2] = __byte_perm_S (w1[0], w0[3], selector); + w2[1] = __byte_perm_S (w0[3], w0[2], selector); + w2[0] = __byte_perm_S (w0[2], w0[1], selector); + w1[3] = __byte_perm_S (w0[1], w0[0], selector); + w1[2] = __byte_perm_S (w0[0], 0, selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 7: + c1[3] = __byte_perm_S ( 0, w3[3], selector); + c1[2] = __byte_perm_S (w3[3], w3[2], selector); + c1[1] = __byte_perm_S (w3[2], w3[1], selector); + c1[0] = __byte_perm_S (w3[1], w3[0], selector); + c0[3] = __byte_perm_S (w3[0], w2[3], selector); + c0[2] = __byte_perm_S (w2[3], w2[2], selector); + c0[1] = __byte_perm_S (w2[2], w2[1], selector); + c0[0] = __byte_perm_S (w2[1], w2[0], selector); + w3[3] = __byte_perm_S (w2[0], w1[3], selector); + w3[2] = __byte_perm_S (w1[3], w1[2], selector); + w3[1] = __byte_perm_S (w1[2], w1[1], selector); + w3[0] = __byte_perm_S (w1[1], w1[0], selector); + w2[3] = __byte_perm_S (w1[0], w0[3], selector); + w2[2] = __byte_perm_S (w0[3], w0[2], selector); + w2[1] = __byte_perm_S (w0[2], w0[1], selector); + w2[0] = __byte_perm_S (w0[1], w0[0], selector); + w1[3] = __byte_perm_S (w0[0], 0, selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 8: + c2[0] = __byte_perm_S ( 0, w3[3], selector); + c1[3] = __byte_perm_S (w3[3], w3[2], selector); + c1[2] = __byte_perm_S (w3[2], w3[1], selector); + c1[1] = __byte_perm_S (w3[1], w3[0], selector); + c1[0] = __byte_perm_S (w3[0], w2[3], selector); + c0[3] = __byte_perm_S (w2[3], w2[2], selector); + c0[2] = __byte_perm_S (w2[2], w2[1], selector); + c0[1] = __byte_perm_S (w2[1], w2[0], selector); + c0[0] = __byte_perm_S (w2[0], w1[3], selector); + w3[3] = __byte_perm_S (w1[3], w1[2], selector); + w3[2] = __byte_perm_S (w1[2], w1[1], selector); + w3[1] = __byte_perm_S (w1[1], w1[0], selector); + w3[0] = __byte_perm_S (w1[0], w0[3], selector); + w2[3] = __byte_perm_S (w0[3], w0[2], selector); + w2[2] = __byte_perm_S (w0[2], w0[1], selector); + w2[1] = __byte_perm_S (w0[1], w0[0], selector); + w2[0] = __byte_perm_S (w0[0], 0, selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 9: + c2[1] = __byte_perm_S ( 0, w3[3], selector); + c2[0] = __byte_perm_S (w3[3], w3[2], selector); + c1[3] = __byte_perm_S (w3[2], w3[1], selector); + c1[2] = __byte_perm_S (w3[1], w3[0], selector); + c1[1] = __byte_perm_S (w3[0], w2[3], selector); + c1[0] = __byte_perm_S (w2[3], w2[2], selector); + c0[3] = __byte_perm_S (w2[2], w2[1], selector); + c0[2] = __byte_perm_S (w2[1], w2[0], selector); + c0[1] = __byte_perm_S (w2[0], w1[3], selector); + c0[0] = __byte_perm_S (w1[3], w1[2], selector); + w3[3] = __byte_perm_S (w1[2], w1[1], selector); + w3[2] = __byte_perm_S (w1[1], w1[0], selector); + w3[1] = __byte_perm_S (w1[0], w0[3], selector); + w3[0] = __byte_perm_S (w0[3], w0[2], selector); + w2[3] = __byte_perm_S (w0[2], w0[1], selector); + w2[2] = __byte_perm_S (w0[1], w0[0], selector); + w2[1] = __byte_perm_S (w0[0], 0, selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 10: + c2[2] = __byte_perm_S ( 0, w3[3], selector); + c2[1] = __byte_perm_S (w3[3], w3[2], selector); + c2[0] = __byte_perm_S (w3[2], w3[1], selector); + c1[3] = __byte_perm_S (w3[1], w3[0], selector); + c1[2] = __byte_perm_S (w3[0], w2[3], selector); + c1[1] = __byte_perm_S (w2[3], w2[2], selector); + c1[0] = __byte_perm_S (w2[2], w2[1], selector); + c0[3] = __byte_perm_S (w2[1], w2[0], selector); + c0[2] = __byte_perm_S (w2[0], w1[3], selector); + c0[1] = __byte_perm_S (w1[3], w1[2], selector); + c0[0] = __byte_perm_S (w1[2], w1[1], selector); + w3[3] = __byte_perm_S (w1[1], w1[0], selector); + w3[2] = __byte_perm_S (w1[0], w0[3], selector); + w3[1] = __byte_perm_S (w0[3], w0[2], selector); + w3[0] = __byte_perm_S (w0[2], w0[1], selector); + w2[3] = __byte_perm_S (w0[1], w0[0], selector); + w2[2] = __byte_perm_S (w0[0], 0, selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 11: + c2[3] = __byte_perm_S ( 0, w3[3], selector); + c2[2] = __byte_perm_S (w3[3], w3[2], selector); + c2[1] = __byte_perm_S (w3[2], w3[1], selector); + c2[0] = __byte_perm_S (w3[1], w3[0], selector); + c1[3] = __byte_perm_S (w3[0], w2[3], selector); + c1[2] = __byte_perm_S (w2[3], w2[2], selector); + c1[1] = __byte_perm_S (w2[2], w2[1], selector); + c1[0] = __byte_perm_S (w2[1], w2[0], selector); + c0[3] = __byte_perm_S (w2[0], w1[3], selector); + c0[2] = __byte_perm_S (w1[3], w1[2], selector); + c0[1] = __byte_perm_S (w1[2], w1[1], selector); + c0[0] = __byte_perm_S (w1[1], w1[0], selector); + w3[3] = __byte_perm_S (w1[0], w0[3], selector); + w3[2] = __byte_perm_S (w0[3], w0[2], selector); + w3[1] = __byte_perm_S (w0[2], w0[1], selector); + w3[0] = __byte_perm_S (w0[1], w0[0], selector); + w2[3] = __byte_perm_S (w0[0], 0, selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 12: + c3[0] = __byte_perm_S ( 0, w3[3], selector); + c2[3] = __byte_perm_S (w3[3], w3[2], selector); + c2[2] = __byte_perm_S (w3[2], w3[1], selector); + c2[1] = __byte_perm_S (w3[1], w3[0], selector); + c2[0] = __byte_perm_S (w3[0], w2[3], selector); + c1[3] = __byte_perm_S (w2[3], w2[2], selector); + c1[2] = __byte_perm_S (w2[2], w2[1], selector); + c1[1] = __byte_perm_S (w2[1], w2[0], selector); + c1[0] = __byte_perm_S (w2[0], w1[3], selector); + c0[3] = __byte_perm_S (w1[3], w1[2], selector); + c0[2] = __byte_perm_S (w1[2], w1[1], selector); + c0[1] = __byte_perm_S (w1[1], w1[0], selector); + c0[0] = __byte_perm_S (w1[0], w0[3], selector); + w3[3] = __byte_perm_S (w0[3], w0[2], selector); + w3[2] = __byte_perm_S (w0[2], w0[1], selector); + w3[1] = __byte_perm_S (w0[1], w0[0], selector); + w3[0] = __byte_perm_S (w0[0], 0, selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 13: + c3[1] = __byte_perm_S ( 0, w3[3], selector); + c3[0] = __byte_perm_S (w3[3], w3[2], selector); + c2[3] = __byte_perm_S (w3[2], w3[1], selector); + c2[2] = __byte_perm_S (w3[1], w3[0], selector); + c2[1] = __byte_perm_S (w3[0], w2[3], selector); + c2[0] = __byte_perm_S (w2[3], w2[2], selector); + c1[3] = __byte_perm_S (w2[2], w2[1], selector); + c1[2] = __byte_perm_S (w2[1], w2[0], selector); + c1[1] = __byte_perm_S (w2[0], w1[3], selector); + c1[0] = __byte_perm_S (w1[3], w1[2], selector); + c0[3] = __byte_perm_S (w1[2], w1[1], selector); + c0[2] = __byte_perm_S (w1[1], w1[0], selector); + c0[1] = __byte_perm_S (w1[0], w0[3], selector); + c0[0] = __byte_perm_S (w0[3], w0[2], selector); + w3[3] = __byte_perm_S (w0[2], w0[1], selector); + w3[2] = __byte_perm_S (w0[1], w0[0], selector); + w3[1] = __byte_perm_S (w0[0], 0, selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 14: + c3[2] = __byte_perm_S ( 0, w3[3], selector); + c3[1] = __byte_perm_S (w3[3], w3[2], selector); + c3[0] = __byte_perm_S (w3[2], w3[1], selector); + c2[3] = __byte_perm_S (w3[1], w3[0], selector); + c2[2] = __byte_perm_S (w3[0], w2[3], selector); + c2[1] = __byte_perm_S (w2[3], w2[2], selector); + c2[0] = __byte_perm_S (w2[2], w2[1], selector); + c1[3] = __byte_perm_S (w2[1], w2[0], selector); + c1[2] = __byte_perm_S (w2[0], w1[3], selector); + c1[1] = __byte_perm_S (w1[3], w1[2], selector); + c1[0] = __byte_perm_S (w1[2], w1[1], selector); + c0[3] = __byte_perm_S (w1[1], w1[0], selector); + c0[2] = __byte_perm_S (w1[0], w0[3], selector); + c0[1] = __byte_perm_S (w0[3], w0[2], selector); + c0[0] = __byte_perm_S (w0[2], w0[1], selector); + w3[3] = __byte_perm_S (w0[1], w0[0], selector); + w3[2] = __byte_perm_S (w0[0], 0, selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + + case 15: + c3[3] = __byte_perm_S ( 0, w3[3], selector); + c3[2] = __byte_perm_S (w3[3], w3[2], selector); + c3[1] = __byte_perm_S (w3[2], w3[1], selector); + c3[0] = __byte_perm_S (w3[1], w3[0], selector); + c2[3] = __byte_perm_S (w3[0], w2[3], selector); + c2[2] = __byte_perm_S (w2[3], w2[2], selector); + c2[1] = __byte_perm_S (w2[2], w2[1], selector); + c2[0] = __byte_perm_S (w2[1], w2[0], selector); + c1[3] = __byte_perm_S (w2[0], w1[3], selector); + c1[2] = __byte_perm_S (w1[3], w1[2], selector); + c1[1] = __byte_perm_S (w1[2], w1[1], selector); + c1[0] = __byte_perm_S (w1[1], w1[0], selector); + c0[3] = __byte_perm_S (w1[0], w0[3], selector); + c0[2] = __byte_perm_S (w0[3], w0[2], selector); + c0[1] = __byte_perm_S (w0[2], w0[1], selector); + c0[0] = __byte_perm_S (w0[1], w0[0], selector); + w3[3] = __byte_perm_S (w0[0], 0, selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + break; + } + #endif } inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)