diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index ff41c41df..abe27347a 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -6947,6 +6947,1619 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w #endif } +inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset; + + switch (offset / 4) + { + case 0: + w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); + w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + + if (offset_mod_4 == 0) + { + w0[0] = w0[1]; + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 1: + w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); + w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[1] = w0[2]; + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 2: + w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); + w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[2] = w0[3]; + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 3: + w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); + w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w0[3] = w1[0]; + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 4: + w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); + w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[0] = w1[1]; + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 5: + w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); + w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[1] = w1[2]; + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 6: + w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); + w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[2] = w1[3]; + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 7: + w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w1[3] = w2[0]; + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 8: + w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); + w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[0] = w2[1]; + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 9: + w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); + w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[1] = w2[2]; + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 10: + w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); + w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[2] = w2[3]; + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 11: + w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); + w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w2[3] = w3[0]; + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 12: + w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); + w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[0] = w3[1]; + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 13: + w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); + w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[1] = w3[2]; + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 14: + w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); + w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[2] = w3[3]; + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + + case 15: + w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); + w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); + w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); + w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); + w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); + w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); + w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); + w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); + w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); + w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); + w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); + w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); + w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); + w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); + w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); + w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); + w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + + if (offset_mod_4 == 0) + { + w3[3] = w4[0]; + w4[0] = w4[1]; + w4[1] = w4[2]; + w4[2] = w4[3]; + w4[3] = w5[0]; + w5[0] = w5[1]; + w5[1] = w5[2]; + w5[2] = w5[3]; + w5[3] = w6[0]; + w6[0] = w6[1]; + w6[1] = w6[2]; + w6[2] = w6[3]; + w6[3] = w7[0]; + w7[0] = w7[1]; + w7[1] = w7[2]; + w7[2] = w7[3]; + w7[3] = 0; + } + + break; + } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w7[3] = __byte_perm_S (w7[2], w7[3], selector); + w7[2] = __byte_perm_S (w7[1], w7[2], selector); + w7[1] = __byte_perm_S (w7[0], w7[1], selector); + w7[0] = __byte_perm_S (w6[3], w7[0], selector); + w6[3] = __byte_perm_S (w6[2], w6[3], selector); + w6[2] = __byte_perm_S (w6[1], w6[2], selector); + w6[1] = __byte_perm_S (w6[0], w6[1], selector); + w6[0] = __byte_perm_S (w5[3], w6[0], selector); + w5[3] = __byte_perm_S (w5[2], w5[3], selector); + w5[2] = __byte_perm_S (w5[1], w5[2], selector); + w5[1] = __byte_perm_S (w5[0], w5[1], selector); + w5[0] = __byte_perm_S (w4[3], w5[0], selector); + w4[3] = __byte_perm_S (w4[2], w4[3], selector); + w4[2] = __byte_perm_S (w4[1], w4[2], selector); + w4[1] = __byte_perm_S (w4[0], w4[1], selector); + w4[0] = __byte_perm_S (w3[3], w4[0], selector); + w3[3] = __byte_perm_S (w3[2], w3[3], selector); + w3[2] = __byte_perm_S (w3[1], w3[2], selector); + w3[1] = __byte_perm_S (w3[0], w3[1], selector); + w3[0] = __byte_perm_S (w2[3], w3[0], selector); + w2[3] = __byte_perm_S (w2[2], w2[3], selector); + w2[2] = __byte_perm_S (w2[1], w2[2], selector); + w2[1] = __byte_perm_S (w2[0], w2[1], selector); + w2[0] = __byte_perm_S (w1[3], w2[0], selector); + w1[3] = __byte_perm_S (w1[2], w1[3], selector); + w1[2] = __byte_perm_S (w1[1], w1[2], selector); + w1[1] = __byte_perm_S (w1[0], w1[1], selector); + w1[0] = __byte_perm_S (w0[3], w1[0], selector); + w0[3] = __byte_perm_S (w0[2], w0[3], selector); + w0[2] = __byte_perm_S (w0[1], w0[2], selector); + w0[1] = __byte_perm_S (w0[0], w0[1], selector); + w0[0] = __byte_perm_S ( 0, w0[0], selector); + break; + + case 1: + w7[3] = __byte_perm_S (w7[1], w7[2], selector); + w7[2] = __byte_perm_S (w7[0], w7[1], selector); + w7[1] = __byte_perm_S (w6[3], w7[0], selector); + w7[0] = __byte_perm_S (w6[2], w6[3], selector); + w6[3] = __byte_perm_S (w6[1], w6[2], selector); + w6[2] = __byte_perm_S (w6[0], w6[1], selector); + w6[1] = __byte_perm_S (w5[3], w6[0], selector); + w6[0] = __byte_perm_S (w5[2], w5[3], selector); + w5[3] = __byte_perm_S (w5[1], w5[2], selector); + w5[2] = __byte_perm_S (w5[0], w5[1], selector); + w5[1] = __byte_perm_S (w4[3], w5[0], selector); + w5[0] = __byte_perm_S (w4[2], w4[3], selector); + w4[3] = __byte_perm_S (w4[1], w4[2], selector); + w4[2] = __byte_perm_S (w4[0], w4[1], selector); + w4[1] = __byte_perm_S (w3[3], w4[0], selector); + w4[0] = __byte_perm_S (w3[2], w3[3], selector); + w3[3] = __byte_perm_S (w3[1], w3[2], selector); + w3[2] = __byte_perm_S (w3[0], w3[1], selector); + w3[1] = __byte_perm_S (w2[3], w3[0], selector); + w3[0] = __byte_perm_S (w2[2], w2[3], selector); + w2[3] = __byte_perm_S (w2[1], w2[2], selector); + w2[2] = __byte_perm_S (w2[0], w2[1], selector); + w2[1] = __byte_perm_S (w1[3], w2[0], selector); + w2[0] = __byte_perm_S (w1[2], w1[3], selector); + w1[3] = __byte_perm_S (w1[1], w1[2], selector); + w1[2] = __byte_perm_S (w1[0], w1[1], selector); + w1[1] = __byte_perm_S (w0[3], w1[0], selector); + w1[0] = __byte_perm_S (w0[2], w0[3], selector); + w0[3] = __byte_perm_S (w0[1], w0[2], selector); + w0[2] = __byte_perm_S (w0[0], w0[1], selector); + w0[1] = __byte_perm_S ( 0, w0[0], selector); + w0[0] = 0; + break; + + case 2: + w7[3] = __byte_perm_S (w7[0], w7[1], selector); + w7[2] = __byte_perm_S (w6[3], w7[0], selector); + w7[1] = __byte_perm_S (w6[2], w6[3], selector); + w7[0] = __byte_perm_S (w6[1], w6[2], selector); + w6[3] = __byte_perm_S (w6[0], w6[1], selector); + w6[2] = __byte_perm_S (w5[3], w6[0], selector); + w6[1] = __byte_perm_S (w5[2], w5[3], selector); + w6[0] = __byte_perm_S (w5[1], w5[2], selector); + w5[3] = __byte_perm_S (w5[0], w5[1], selector); + w5[2] = __byte_perm_S (w4[3], w5[0], selector); + w5[1] = __byte_perm_S (w4[2], w4[3], selector); + w5[0] = __byte_perm_S (w4[1], w4[2], selector); + w4[3] = __byte_perm_S (w4[0], w4[1], selector); + w4[2] = __byte_perm_S (w3[3], w4[0], selector); + w4[1] = __byte_perm_S (w3[2], w3[3], selector); + w4[0] = __byte_perm_S (w3[1], w3[2], selector); + w3[3] = __byte_perm_S (w3[0], w3[1], selector); + w3[2] = __byte_perm_S (w2[3], w3[0], selector); + w3[1] = __byte_perm_S (w2[2], w2[3], selector); + w3[0] = __byte_perm_S (w2[1], w2[2], selector); + w2[3] = __byte_perm_S (w2[0], w2[1], selector); + w2[2] = __byte_perm_S (w1[3], w2[0], selector); + w2[1] = __byte_perm_S (w1[2], w1[3], selector); + w2[0] = __byte_perm_S (w1[1], w1[2], selector); + w1[3] = __byte_perm_S (w1[0], w1[1], selector); + w1[2] = __byte_perm_S (w0[3], w1[0], selector); + w1[1] = __byte_perm_S (w0[2], w0[3], selector); + w1[0] = __byte_perm_S (w0[1], w0[2], selector); + w0[3] = __byte_perm_S (w0[0], w0[1], selector); + w0[2] = __byte_perm_S ( 0, w0[0], selector); + w0[1] = 0; + w0[0] = 0; + break; + + case 3: + w7[3] = __byte_perm_S (w6[3], w7[0], selector); + w7[2] = __byte_perm_S (w6[2], w6[3], selector); + w7[1] = __byte_perm_S (w6[1], w6[2], selector); + w7[0] = __byte_perm_S (w6[0], w6[1], selector); + w6[3] = __byte_perm_S (w5[3], w6[0], selector); + w6[2] = __byte_perm_S (w5[2], w5[3], selector); + w6[1] = __byte_perm_S (w5[1], w5[2], selector); + w6[0] = __byte_perm_S (w5[0], w5[1], selector); + w5[3] = __byte_perm_S (w4[3], w5[0], selector); + w5[2] = __byte_perm_S (w4[2], w4[3], selector); + w5[1] = __byte_perm_S (w4[1], w4[2], selector); + w5[0] = __byte_perm_S (w4[0], w4[1], selector); + w4[3] = __byte_perm_S (w3[3], w4[0], selector); + w4[2] = __byte_perm_S (w3[2], w3[3], selector); + w4[1] = __byte_perm_S (w3[1], w3[2], selector); + w4[0] = __byte_perm_S (w3[0], w3[1], selector); + w3[3] = __byte_perm_S (w2[3], w3[0], selector); + w3[2] = __byte_perm_S (w2[2], w2[3], selector); + w3[1] = __byte_perm_S (w2[1], w2[2], selector); + w3[0] = __byte_perm_S (w2[0], w2[1], selector); + w2[3] = __byte_perm_S (w1[3], w2[0], selector); + w2[2] = __byte_perm_S (w1[2], w1[3], selector); + w2[1] = __byte_perm_S (w1[1], w1[2], selector); + w2[0] = __byte_perm_S (w1[0], w1[1], selector); + w1[3] = __byte_perm_S (w0[3], w1[0], selector); + w1[2] = __byte_perm_S (w0[2], w0[3], selector); + w1[1] = __byte_perm_S (w0[1], w0[2], selector); + w1[0] = __byte_perm_S (w0[0], w0[1], selector); + w0[3] = __byte_perm_S ( 0, w0[0], selector); + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 4: + w7[3] = __byte_perm_S (w6[2], w6[3], selector); + w7[2] = __byte_perm_S (w6[1], w6[2], selector); + w7[1] = __byte_perm_S (w6[0], w6[1], selector); + w7[0] = __byte_perm_S (w5[3], w6[0], selector); + w6[3] = __byte_perm_S (w5[2], w5[3], selector); + w6[2] = __byte_perm_S (w5[1], w5[2], selector); + w6[1] = __byte_perm_S (w5[0], w5[1], selector); + w6[0] = __byte_perm_S (w4[3], w5[0], selector); + w5[3] = __byte_perm_S (w4[2], w4[3], selector); + w5[2] = __byte_perm_S (w4[1], w4[2], selector); + w5[1] = __byte_perm_S (w4[0], w4[1], selector); + w5[0] = __byte_perm_S (w3[3], w4[0], selector); + w4[3] = __byte_perm_S (w3[2], w3[3], selector); + w4[2] = __byte_perm_S (w3[1], w3[2], selector); + w4[1] = __byte_perm_S (w3[0], w3[1], selector); + w4[0] = __byte_perm_S (w2[3], w3[0], selector); + w3[3] = __byte_perm_S (w2[2], w2[3], selector); + w3[2] = __byte_perm_S (w2[1], w2[2], selector); + w3[1] = __byte_perm_S (w2[0], w2[1], selector); + w3[0] = __byte_perm_S (w1[3], w2[0], selector); + w2[3] = __byte_perm_S (w1[2], w1[3], selector); + w2[2] = __byte_perm_S (w1[1], w1[2], selector); + w2[1] = __byte_perm_S (w1[0], w1[1], selector); + w2[0] = __byte_perm_S (w0[3], w1[0], selector); + w1[3] = __byte_perm_S (w0[2], w0[3], selector); + w1[2] = __byte_perm_S (w0[1], w0[2], selector); + w1[1] = __byte_perm_S (w0[0], w0[1], selector); + w1[0] = __byte_perm_S ( 0, w0[0], selector); + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 5: + w7[3] = __byte_perm_S (w6[1], w6[2], selector); + w7[2] = __byte_perm_S (w6[0], w6[1], selector); + w7[1] = __byte_perm_S (w5[3], w6[0], selector); + w7[0] = __byte_perm_S (w5[2], w5[3], selector); + w6[3] = __byte_perm_S (w5[1], w5[2], selector); + w6[2] = __byte_perm_S (w5[0], w5[1], selector); + w6[1] = __byte_perm_S (w4[3], w5[0], selector); + w6[0] = __byte_perm_S (w4[2], w4[3], selector); + w5[3] = __byte_perm_S (w4[1], w4[2], selector); + w5[2] = __byte_perm_S (w4[0], w4[1], selector); + w5[1] = __byte_perm_S (w3[3], w4[0], selector); + w5[0] = __byte_perm_S (w3[2], w3[3], selector); + w4[3] = __byte_perm_S (w3[1], w3[2], selector); + w4[2] = __byte_perm_S (w3[0], w3[1], selector); + w4[1] = __byte_perm_S (w2[3], w3[0], selector); + w4[0] = __byte_perm_S (w2[2], w2[3], selector); + w3[3] = __byte_perm_S (w2[1], w2[2], selector); + w3[2] = __byte_perm_S (w2[0], w2[1], selector); + w3[1] = __byte_perm_S (w1[3], w2[0], selector); + w3[0] = __byte_perm_S (w1[2], w1[3], selector); + w2[3] = __byte_perm_S (w1[1], w1[2], selector); + w2[2] = __byte_perm_S (w1[0], w1[1], selector); + w2[1] = __byte_perm_S (w0[3], w1[0], selector); + w2[0] = __byte_perm_S (w0[2], w0[3], selector); + w1[3] = __byte_perm_S (w0[1], w0[2], selector); + w1[2] = __byte_perm_S (w0[0], w0[1], selector); + w1[1] = __byte_perm_S ( 0, w0[0], selector); + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 6: + w7[3] = __byte_perm_S (w6[0], w6[1], selector); + w7[2] = __byte_perm_S (w5[3], w6[0], selector); + w7[1] = __byte_perm_S (w5[2], w5[3], selector); + w7[0] = __byte_perm_S (w5[1], w5[2], selector); + w6[3] = __byte_perm_S (w5[0], w5[1], selector); + w6[2] = __byte_perm_S (w4[3], w5[0], selector); + w6[1] = __byte_perm_S (w4[2], w4[3], selector); + w6[0] = __byte_perm_S (w4[1], w4[2], selector); + w5[3] = __byte_perm_S (w4[0], w4[1], selector); + w5[2] = __byte_perm_S (w3[3], w4[0], selector); + w5[1] = __byte_perm_S (w3[2], w3[3], selector); + w5[0] = __byte_perm_S (w3[1], w3[2], selector); + w4[3] = __byte_perm_S (w3[0], w3[1], selector); + w4[2] = __byte_perm_S (w2[3], w3[0], selector); + w4[1] = __byte_perm_S (w2[2], w2[3], selector); + w4[0] = __byte_perm_S (w2[1], w2[2], selector); + w3[3] = __byte_perm_S (w2[0], w2[1], selector); + w3[2] = __byte_perm_S (w1[3], w2[0], selector); + w3[1] = __byte_perm_S (w1[2], w1[3], selector); + w3[0] = __byte_perm_S (w1[1], w1[2], selector); + w2[3] = __byte_perm_S (w1[0], w1[1], selector); + w2[2] = __byte_perm_S (w0[3], w1[0], selector); + w2[1] = __byte_perm_S (w0[2], w0[3], selector); + w2[0] = __byte_perm_S (w0[1], w0[2], selector); + w1[3] = __byte_perm_S (w0[0], w0[1], selector); + w1[2] = __byte_perm_S ( 0, w0[0], selector); + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 7: + w7[3] = __byte_perm_S (w5[3], w6[0], selector); + w7[2] = __byte_perm_S (w5[2], w5[3], selector); + w7[1] = __byte_perm_S (w5[1], w5[2], selector); + w7[0] = __byte_perm_S (w5[0], w5[1], selector); + w6[3] = __byte_perm_S (w4[3], w5[0], selector); + w6[2] = __byte_perm_S (w4[2], w4[3], selector); + w6[1] = __byte_perm_S (w4[1], w4[2], selector); + w6[0] = __byte_perm_S (w4[0], w4[1], selector); + w5[3] = __byte_perm_S (w3[3], w4[0], selector); + w5[2] = __byte_perm_S (w3[2], w3[3], selector); + w5[1] = __byte_perm_S (w3[1], w3[2], selector); + w5[0] = __byte_perm_S (w3[0], w3[1], selector); + w4[3] = __byte_perm_S (w2[3], w3[0], selector); + w4[2] = __byte_perm_S (w2[2], w2[3], selector); + w4[1] = __byte_perm_S (w2[1], w2[2], selector); + w4[0] = __byte_perm_S (w2[0], w2[1], selector); + w3[3] = __byte_perm_S (w1[3], w2[0], selector); + w3[2] = __byte_perm_S (w1[2], w1[3], selector); + w3[1] = __byte_perm_S (w1[1], w1[2], selector); + w3[0] = __byte_perm_S (w1[0], w1[1], selector); + w2[3] = __byte_perm_S (w0[3], w1[0], selector); + w2[2] = __byte_perm_S (w0[2], w0[3], selector); + w2[1] = __byte_perm_S (w0[1], w0[2], selector); + w2[0] = __byte_perm_S (w0[0], w0[1], selector); + w1[3] = __byte_perm_S ( 0, w0[0], selector); + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 8: + w7[3] = __byte_perm_S (w5[2], w5[3], selector); + w7[2] = __byte_perm_S (w5[1], w5[2], selector); + w7[1] = __byte_perm_S (w5[0], w5[1], selector); + w7[0] = __byte_perm_S (w4[3], w5[0], selector); + w6[3] = __byte_perm_S (w4[2], w4[3], selector); + w6[2] = __byte_perm_S (w4[1], w4[2], selector); + w6[1] = __byte_perm_S (w4[0], w4[1], selector); + w6[0] = __byte_perm_S (w3[3], w4[0], selector); + w5[3] = __byte_perm_S (w3[2], w3[3], selector); + w5[2] = __byte_perm_S (w3[1], w3[2], selector); + w5[1] = __byte_perm_S (w3[0], w3[1], selector); + w5[0] = __byte_perm_S (w2[3], w3[0], selector); + w4[3] = __byte_perm_S (w2[2], w2[3], selector); + w4[2] = __byte_perm_S (w2[1], w2[2], selector); + w4[1] = __byte_perm_S (w2[0], w2[1], selector); + w4[0] = __byte_perm_S (w1[3], w2[0], selector); + w3[3] = __byte_perm_S (w1[2], w1[3], selector); + w3[2] = __byte_perm_S (w1[1], w1[2], selector); + w3[1] = __byte_perm_S (w1[0], w1[1], selector); + w3[0] = __byte_perm_S (w0[3], w1[0], selector); + w2[3] = __byte_perm_S (w0[2], w0[3], selector); + w2[2] = __byte_perm_S (w0[1], w0[2], selector); + w2[1] = __byte_perm_S (w0[0], w0[1], selector); + w2[0] = __byte_perm_S ( 0, w0[0], selector); + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 9: + w7[3] = __byte_perm_S (w5[1], w5[2], selector); + w7[2] = __byte_perm_S (w5[0], w5[1], selector); + w7[1] = __byte_perm_S (w4[3], w5[0], selector); + w7[0] = __byte_perm_S (w4[2], w4[3], selector); + w6[3] = __byte_perm_S (w4[1], w4[2], selector); + w6[2] = __byte_perm_S (w4[0], w4[1], selector); + w6[1] = __byte_perm_S (w3[3], w4[0], selector); + w6[0] = __byte_perm_S (w3[2], w3[3], selector); + w5[3] = __byte_perm_S (w3[1], w3[2], selector); + w5[2] = __byte_perm_S (w3[0], w3[1], selector); + w5[1] = __byte_perm_S (w2[3], w3[0], selector); + w5[0] = __byte_perm_S (w2[2], w2[3], selector); + w4[3] = __byte_perm_S (w2[1], w2[2], selector); + w4[2] = __byte_perm_S (w2[0], w2[1], selector); + w4[1] = __byte_perm_S (w1[3], w2[0], selector); + w4[0] = __byte_perm_S (w1[2], w1[3], selector); + w3[3] = __byte_perm_S (w1[1], w1[2], selector); + w3[2] = __byte_perm_S (w1[0], w1[1], selector); + w3[1] = __byte_perm_S (w0[3], w1[0], selector); + w3[0] = __byte_perm_S (w0[2], w0[3], selector); + w2[3] = __byte_perm_S (w0[1], w0[2], selector); + w2[2] = __byte_perm_S (w0[0], w0[1], selector); + w2[1] = __byte_perm_S ( 0, w0[0], selector); + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 10: + w7[3] = __byte_perm_S (w5[0], w5[1], selector); + w7[2] = __byte_perm_S (w4[3], w5[0], selector); + w7[1] = __byte_perm_S (w4[2], w4[3], selector); + w7[0] = __byte_perm_S (w4[1], w4[2], selector); + w6[3] = __byte_perm_S (w4[0], w4[1], selector); + w6[2] = __byte_perm_S (w3[3], w4[0], selector); + w6[1] = __byte_perm_S (w3[2], w3[3], selector); + w6[0] = __byte_perm_S (w3[1], w3[2], selector); + w5[3] = __byte_perm_S (w3[0], w3[1], selector); + w5[2] = __byte_perm_S (w2[3], w3[0], selector); + w5[1] = __byte_perm_S (w2[2], w2[3], selector); + w5[0] = __byte_perm_S (w2[1], w2[2], selector); + w4[3] = __byte_perm_S (w2[0], w2[1], selector); + w4[2] = __byte_perm_S (w1[3], w2[0], selector); + w4[1] = __byte_perm_S (w1[2], w1[3], selector); + w4[0] = __byte_perm_S (w1[1], w1[2], selector); + w3[3] = __byte_perm_S (w1[0], w1[1], selector); + w3[2] = __byte_perm_S (w0[3], w1[0], selector); + w3[1] = __byte_perm_S (w0[2], w0[3], selector); + w3[0] = __byte_perm_S (w0[1], w0[2], selector); + w2[3] = __byte_perm_S (w0[0], w0[1], selector); + w2[2] = __byte_perm_S ( 0, w0[0], selector); + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 11: + w7[3] = __byte_perm_S (w4[3], w5[0], selector); + w7[2] = __byte_perm_S (w4[2], w4[3], selector); + w7[1] = __byte_perm_S (w4[1], w4[2], selector); + w7[0] = __byte_perm_S (w4[0], w4[1], selector); + w6[3] = __byte_perm_S (w3[3], w4[0], selector); + w6[2] = __byte_perm_S (w3[2], w3[3], selector); + w6[1] = __byte_perm_S (w3[1], w3[2], selector); + w6[0] = __byte_perm_S (w3[0], w3[1], selector); + w5[3] = __byte_perm_S (w2[3], w3[0], selector); + w5[2] = __byte_perm_S (w2[2], w2[3], selector); + w5[1] = __byte_perm_S (w2[1], w2[2], selector); + w5[0] = __byte_perm_S (w2[0], w2[1], selector); + w4[3] = __byte_perm_S (w1[3], w2[0], selector); + w4[2] = __byte_perm_S (w1[2], w1[3], selector); + w4[1] = __byte_perm_S (w1[1], w1[2], selector); + w4[0] = __byte_perm_S (w1[0], w1[1], selector); + w3[3] = __byte_perm_S (w0[3], w1[0], selector); + w3[2] = __byte_perm_S (w0[2], w0[3], selector); + w3[1] = __byte_perm_S (w0[1], w0[2], selector); + w3[0] = __byte_perm_S (w0[0], w0[1], selector); + w2[3] = __byte_perm_S ( 0, w0[0], selector); + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 12: + w7[3] = __byte_perm_S (w4[2], w4[3], selector); + w7[2] = __byte_perm_S (w4[1], w4[2], selector); + w7[1] = __byte_perm_S (w4[0], w4[1], selector); + w7[0] = __byte_perm_S (w3[3], w4[0], selector); + w6[3] = __byte_perm_S (w3[2], w3[3], selector); + w6[2] = __byte_perm_S (w3[1], w3[2], selector); + w6[1] = __byte_perm_S (w3[0], w3[1], selector); + w6[0] = __byte_perm_S (w2[3], w3[0], selector); + w5[3] = __byte_perm_S (w2[2], w2[3], selector); + w5[2] = __byte_perm_S (w2[1], w2[2], selector); + w5[1] = __byte_perm_S (w2[0], w2[1], selector); + w5[0] = __byte_perm_S (w1[3], w2[0], selector); + w4[3] = __byte_perm_S (w1[2], w1[3], selector); + w4[2] = __byte_perm_S (w1[1], w1[2], selector); + w4[1] = __byte_perm_S (w1[0], w1[1], selector); + w4[0] = __byte_perm_S (w0[3], w1[0], selector); + w3[3] = __byte_perm_S (w0[2], w0[3], selector); + w3[2] = __byte_perm_S (w0[1], w0[2], selector); + w3[1] = __byte_perm_S (w0[0], w0[1], selector); + w3[0] = __byte_perm_S ( 0, w0[0], selector); + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 13: + w7[3] = __byte_perm_S (w4[1], w4[2], selector); + w7[2] = __byte_perm_S (w4[0], w4[1], selector); + w7[1] = __byte_perm_S (w3[3], w4[0], selector); + w7[0] = __byte_perm_S (w3[2], w3[3], selector); + w6[3] = __byte_perm_S (w3[1], w3[2], selector); + w6[2] = __byte_perm_S (w3[0], w3[1], selector); + w6[1] = __byte_perm_S (w2[3], w3[0], selector); + w6[0] = __byte_perm_S (w2[2], w2[3], selector); + w5[3] = __byte_perm_S (w2[1], w2[2], selector); + w5[2] = __byte_perm_S (w2[0], w2[1], selector); + w5[1] = __byte_perm_S (w1[3], w2[0], selector); + w5[0] = __byte_perm_S (w1[2], w1[3], selector); + w4[3] = __byte_perm_S (w1[1], w1[2], selector); + w4[2] = __byte_perm_S (w1[0], w1[1], selector); + w4[1] = __byte_perm_S (w0[3], w1[0], selector); + w4[0] = __byte_perm_S (w0[2], w0[3], selector); + w3[3] = __byte_perm_S (w0[1], w0[2], selector); + w3[2] = __byte_perm_S (w0[0], w0[1], selector); + w3[1] = __byte_perm_S ( 0, w0[0], selector); + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 14: + w7[3] = __byte_perm_S (w4[0], w4[1], selector); + w7[2] = __byte_perm_S (w3[3], w4[0], selector); + w7[1] = __byte_perm_S (w3[2], w3[3], selector); + w7[0] = __byte_perm_S (w3[1], w3[2], selector); + w6[3] = __byte_perm_S (w3[0], w3[1], selector); + w6[2] = __byte_perm_S (w2[3], w3[0], selector); + w6[1] = __byte_perm_S (w2[2], w2[3], selector); + w6[0] = __byte_perm_S (w2[1], w2[2], selector); + w5[3] = __byte_perm_S (w2[0], w2[1], selector); + w5[2] = __byte_perm_S (w1[3], w2[0], selector); + w5[1] = __byte_perm_S (w1[2], w1[3], selector); + w5[0] = __byte_perm_S (w1[1], w1[2], selector); + w4[3] = __byte_perm_S (w1[0], w1[1], selector); + w4[2] = __byte_perm_S (w0[3], w1[0], selector); + w4[1] = __byte_perm_S (w0[2], w0[3], selector); + w4[0] = __byte_perm_S (w0[1], w0[2], selector); + w3[3] = __byte_perm_S (w0[0], w0[1], selector); + w3[2] = __byte_perm_S ( 0, w0[0], selector); + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + + case 15: + w7[3] = __byte_perm_S (w3[3], w4[0], selector); + w7[2] = __byte_perm_S (w3[2], w3[3], selector); + w7[1] = __byte_perm_S (w3[1], w3[2], selector); + w7[0] = __byte_perm_S (w3[0], w3[1], selector); + w6[3] = __byte_perm_S (w2[3], w3[0], selector); + w6[2] = __byte_perm_S (w2[2], w2[3], selector); + w6[1] = __byte_perm_S (w2[1], w2[2], selector); + w6[0] = __byte_perm_S (w2[0], w2[1], selector); + w5[3] = __byte_perm_S (w1[3], w2[0], selector); + w5[2] = __byte_perm_S (w1[2], w1[3], selector); + w5[1] = __byte_perm_S (w1[1], w1[2], selector); + w5[0] = __byte_perm_S (w1[0], w1[1], selector); + w4[3] = __byte_perm_S (w0[3], w1[0], selector); + w4[2] = __byte_perm_S (w0[2], w0[3], selector); + w4[1] = __byte_perm_S (w0[1], w0[2], selector); + w4[0] = __byte_perm_S (w0[0], w0[1], selector); + w3[3] = __byte_perm_S ( 0, w0[0], selector); + w3[2] = 0; + w3[1] = 0; + w3[0] = 0; + w2[3] = 0; + w2[2] = 0; + w2[1] = 0; + w2[0] = 0; + w1[3] = 0; + w1[2] = 0; + w1[1] = 0; + w1[0] = 0; + w0[3] = 0; + w0[2] = 0; + w0[1] = 0; + w0[0] = 0; + break; + } + #endif +} + inline void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; @@ -8862,6 +10475,26 @@ inline void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], PACKSV4 (s2, v2, e); \ PACKSV4 (s3, v3, e); +#define PACKVS84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ + PACKVS4 (s0, v0, e); \ + PACKVS4 (s1, v1, e); \ + PACKVS4 (s2, v2, e); \ + PACKVS4 (s3, v3, e); \ + PACKVS4 (s4, v4, e); \ + PACKVS4 (s5, v5, e); \ + PACKVS4 (s6, v6, e); \ + PACKVS4 (s7, v7, e); + +#define PACKSV84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ + PACKSV4 (s0, v0, e); \ + PACKSV4 (s1, v1, e); \ + PACKSV4 (s2, v2, e); \ + PACKSV4 (s3, v3, e); \ + PACKSV4 (s4, v4, e); \ + PACKSV4 (s5, v5, e); \ + PACKSV4 (s6, v6, e); \ + PACKSV4 (s7, v7, e); + inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 @@ -8922,6 +10555,186 @@ inline void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u #endif } +inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) +{ + #if VECT_SIZE == 1 + + switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, offset); + + #else + + u32 t0[4]; + u32 t1[4]; + u32 t2[4]; + u32 t3[4]; + u32 t4[4]; + u32 t5[4]; + u32 t6[4]; + u32 t7[4]; + + #endif + + #if VECT_SIZE == 2 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + #elif VECT_SIZE == 4 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + #elif VECT_SIZE == 8 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + // 5 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + + // 6 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + + // 7 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + + // 8 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + + #elif VECT_SIZE == 16 + + // 1 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); + + // 2 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); + + // 3 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); + + // 4 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); + + // 5 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); + + // 6 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); + + // 7 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); + + // 8 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); + + // 9 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s8); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); + + // 10 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s9); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); + + // 11 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sa); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); + + // 12 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sb); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); + + // 13 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sc); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); + + // 14 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sd); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); + + // 15 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.se); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); + + // 16 + PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); + switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sf); + PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); + + #endif +} + inline void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 diff --git a/OpenCL/m15000_a0.cl b/OpenCL/m15000_a0.cl index 25b75e40a..35a95f9ec 100644 --- a/OpenCL/m15000_a0.cl +++ b/OpenCL/m15000_a0.cl @@ -16,1819 +16,6 @@ #include "inc_rp.cl" #include "inc_simd.cl" -inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset; - - switch (offset / 4) - { - case 0: - w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 1: - w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 2: - w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 3: - w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 4: - w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 5: - w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 6: - w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 7: - w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 8: - w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 9: - w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 10: - w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 11: - w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 12: - w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 13: - w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 14: - w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 15: - w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - } - #endif - - #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - w7[3] = __byte_perm_S (w7[2], w7[3], selector); - w7[2] = __byte_perm_S (w7[1], w7[2], selector); - w7[1] = __byte_perm_S (w7[0], w7[1], selector); - w7[0] = __byte_perm_S (w6[3], w7[0], selector); - w6[3] = __byte_perm_S (w6[2], w6[3], selector); - w6[2] = __byte_perm_S (w6[1], w6[2], selector); - w6[1] = __byte_perm_S (w6[0], w6[1], selector); - w6[0] = __byte_perm_S (w5[3], w6[0], selector); - w5[3] = __byte_perm_S (w5[2], w5[3], selector); - w5[2] = __byte_perm_S (w5[1], w5[2], selector); - w5[1] = __byte_perm_S (w5[0], w5[1], selector); - w5[0] = __byte_perm_S (w4[3], w5[0], selector); - w4[3] = __byte_perm_S (w4[2], w4[3], selector); - w4[2] = __byte_perm_S (w4[1], w4[2], selector); - w4[1] = __byte_perm_S (w4[0], w4[1], selector); - w4[0] = __byte_perm_S (w3[3], w4[0], selector); - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); - break; - - case 1: - w7[3] = __byte_perm_S (w7[1], w7[2], selector); - w7[2] = __byte_perm_S (w7[0], w7[1], selector); - w7[1] = __byte_perm_S (w6[3], w7[0], selector); - w7[0] = __byte_perm_S (w6[2], w6[3], selector); - w6[3] = __byte_perm_S (w6[1], w6[2], selector); - w6[2] = __byte_perm_S (w6[0], w6[1], selector); - w6[1] = __byte_perm_S (w5[3], w6[0], selector); - w6[0] = __byte_perm_S (w5[2], w5[3], selector); - w5[3] = __byte_perm_S (w5[1], w5[2], selector); - w5[2] = __byte_perm_S (w5[0], w5[1], selector); - w5[1] = __byte_perm_S (w4[3], w5[0], selector); - w5[0] = __byte_perm_S (w4[2], w4[3], selector); - w4[3] = __byte_perm_S (w4[1], w4[2], selector); - w4[2] = __byte_perm_S (w4[0], w4[1], selector); - w4[1] = __byte_perm_S (w3[3], w4[0], selector); - w4[0] = __byte_perm_S (w3[2], w3[3], selector); - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); - w0[0] = 0; - break; - - case 2: - w7[3] = __byte_perm_S (w7[0], w7[1], selector); - w7[2] = __byte_perm_S (w6[3], w7[0], selector); - w7[1] = __byte_perm_S (w6[2], w6[3], selector); - w7[0] = __byte_perm_S (w6[1], w6[2], selector); - w6[3] = __byte_perm_S (w6[0], w6[1], selector); - w6[2] = __byte_perm_S (w5[3], w6[0], selector); - w6[1] = __byte_perm_S (w5[2], w5[3], selector); - w6[0] = __byte_perm_S (w5[1], w5[2], selector); - w5[3] = __byte_perm_S (w5[0], w5[1], selector); - w5[2] = __byte_perm_S (w4[3], w5[0], selector); - w5[1] = __byte_perm_S (w4[2], w4[3], selector); - w5[0] = __byte_perm_S (w4[1], w4[2], selector); - w4[3] = __byte_perm_S (w4[0], w4[1], selector); - w4[2] = __byte_perm_S (w3[3], w4[0], selector); - w4[1] = __byte_perm_S (w3[2], w3[3], selector); - w4[0] = __byte_perm_S (w3[1], w3[2], selector); - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; - break; - - case 3: - w7[3] = __byte_perm_S (w6[3], w7[0], selector); - w7[2] = __byte_perm_S (w6[2], w6[3], selector); - w7[1] = __byte_perm_S (w6[1], w6[2], selector); - w7[0] = __byte_perm_S (w6[0], w6[1], selector); - w6[3] = __byte_perm_S (w5[3], w6[0], selector); - w6[2] = __byte_perm_S (w5[2], w5[3], selector); - w6[1] = __byte_perm_S (w5[1], w5[2], selector); - w6[0] = __byte_perm_S (w5[0], w5[1], selector); - w5[3] = __byte_perm_S (w4[3], w5[0], selector); - w5[2] = __byte_perm_S (w4[2], w4[3], selector); - w5[1] = __byte_perm_S (w4[1], w4[2], selector); - w5[0] = __byte_perm_S (w4[0], w4[1], selector); - w4[3] = __byte_perm_S (w3[3], w4[0], selector); - w4[2] = __byte_perm_S (w3[2], w3[3], selector); - w4[1] = __byte_perm_S (w3[1], w3[2], selector); - w4[0] = __byte_perm_S (w3[0], w3[1], selector); - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 4: - w7[3] = __byte_perm_S (w6[2], w6[3], selector); - w7[2] = __byte_perm_S (w6[1], w6[2], selector); - w7[1] = __byte_perm_S (w6[0], w6[1], selector); - w7[0] = __byte_perm_S (w5[3], w6[0], selector); - w6[3] = __byte_perm_S (w5[2], w5[3], selector); - w6[2] = __byte_perm_S (w5[1], w5[2], selector); - w6[1] = __byte_perm_S (w5[0], w5[1], selector); - w6[0] = __byte_perm_S (w4[3], w5[0], selector); - w5[3] = __byte_perm_S (w4[2], w4[3], selector); - w5[2] = __byte_perm_S (w4[1], w4[2], selector); - w5[1] = __byte_perm_S (w4[0], w4[1], selector); - w5[0] = __byte_perm_S (w3[3], w4[0], selector); - w4[3] = __byte_perm_S (w3[2], w3[3], selector); - w4[2] = __byte_perm_S (w3[1], w3[2], selector); - w4[1] = __byte_perm_S (w3[0], w3[1], selector); - w4[0] = __byte_perm_S (w2[3], w3[0], selector); - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 5: - w7[3] = __byte_perm_S (w6[1], w6[2], selector); - w7[2] = __byte_perm_S (w6[0], w6[1], selector); - w7[1] = __byte_perm_S (w5[3], w6[0], selector); - w7[0] = __byte_perm_S (w5[2], w5[3], selector); - w6[3] = __byte_perm_S (w5[1], w5[2], selector); - w6[2] = __byte_perm_S (w5[0], w5[1], selector); - w6[1] = __byte_perm_S (w4[3], w5[0], selector); - w6[0] = __byte_perm_S (w4[2], w4[3], selector); - w5[3] = __byte_perm_S (w4[1], w4[2], selector); - w5[2] = __byte_perm_S (w4[0], w4[1], selector); - w5[1] = __byte_perm_S (w3[3], w4[0], selector); - w5[0] = __byte_perm_S (w3[2], w3[3], selector); - w4[3] = __byte_perm_S (w3[1], w3[2], selector); - w4[2] = __byte_perm_S (w3[0], w3[1], selector); - w4[1] = __byte_perm_S (w2[3], w3[0], selector); - w4[0] = __byte_perm_S (w2[2], w2[3], selector); - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 6: - w7[3] = __byte_perm_S (w6[0], w6[1], selector); - w7[2] = __byte_perm_S (w5[3], w6[0], selector); - w7[1] = __byte_perm_S (w5[2], w5[3], selector); - w7[0] = __byte_perm_S (w5[1], w5[2], selector); - w6[3] = __byte_perm_S (w5[0], w5[1], selector); - w6[2] = __byte_perm_S (w4[3], w5[0], selector); - w6[1] = __byte_perm_S (w4[2], w4[3], selector); - w6[0] = __byte_perm_S (w4[1], w4[2], selector); - w5[3] = __byte_perm_S (w4[0], w4[1], selector); - w5[2] = __byte_perm_S (w3[3], w4[0], selector); - w5[1] = __byte_perm_S (w3[2], w3[3], selector); - w5[0] = __byte_perm_S (w3[1], w3[2], selector); - w4[3] = __byte_perm_S (w3[0], w3[1], selector); - w4[2] = __byte_perm_S (w2[3], w3[0], selector); - w4[1] = __byte_perm_S (w2[2], w2[3], selector); - w4[0] = __byte_perm_S (w2[1], w2[2], selector); - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 7: - w7[3] = __byte_perm_S (w5[3], w6[0], selector); - w7[2] = __byte_perm_S (w5[2], w5[3], selector); - w7[1] = __byte_perm_S (w5[1], w5[2], selector); - w7[0] = __byte_perm_S (w5[0], w5[1], selector); - w6[3] = __byte_perm_S (w4[3], w5[0], selector); - w6[2] = __byte_perm_S (w4[2], w4[3], selector); - w6[1] = __byte_perm_S (w4[1], w4[2], selector); - w6[0] = __byte_perm_S (w4[0], w4[1], selector); - w5[3] = __byte_perm_S (w3[3], w4[0], selector); - w5[2] = __byte_perm_S (w3[2], w3[3], selector); - w5[1] = __byte_perm_S (w3[1], w3[2], selector); - w5[0] = __byte_perm_S (w3[0], w3[1], selector); - w4[3] = __byte_perm_S (w2[3], w3[0], selector); - w4[2] = __byte_perm_S (w2[2], w2[3], selector); - w4[1] = __byte_perm_S (w2[1], w2[2], selector); - w4[0] = __byte_perm_S (w2[0], w2[1], selector); - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 8: - w7[3] = __byte_perm_S (w5[2], w5[3], selector); - w7[2] = __byte_perm_S (w5[1], w5[2], selector); - w7[1] = __byte_perm_S (w5[0], w5[1], selector); - w7[0] = __byte_perm_S (w4[3], w5[0], selector); - w6[3] = __byte_perm_S (w4[2], w4[3], selector); - w6[2] = __byte_perm_S (w4[1], w4[2], selector); - w6[1] = __byte_perm_S (w4[0], w4[1], selector); - w6[0] = __byte_perm_S (w3[3], w4[0], selector); - w5[3] = __byte_perm_S (w3[2], w3[3], selector); - w5[2] = __byte_perm_S (w3[1], w3[2], selector); - w5[1] = __byte_perm_S (w3[0], w3[1], selector); - w5[0] = __byte_perm_S (w2[3], w3[0], selector); - w4[3] = __byte_perm_S (w2[2], w2[3], selector); - w4[2] = __byte_perm_S (w2[1], w2[2], selector); - w4[1] = __byte_perm_S (w2[0], w2[1], selector); - w4[0] = __byte_perm_S (w1[3], w2[0], selector); - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 9: - w7[3] = __byte_perm_S (w5[1], w5[2], selector); - w7[2] = __byte_perm_S (w5[0], w5[1], selector); - w7[1] = __byte_perm_S (w4[3], w5[0], selector); - w7[0] = __byte_perm_S (w4[2], w4[3], selector); - w6[3] = __byte_perm_S (w4[1], w4[2], selector); - w6[2] = __byte_perm_S (w4[0], w4[1], selector); - w6[1] = __byte_perm_S (w3[3], w4[0], selector); - w6[0] = __byte_perm_S (w3[2], w3[3], selector); - w5[3] = __byte_perm_S (w3[1], w3[2], selector); - w5[2] = __byte_perm_S (w3[0], w3[1], selector); - w5[1] = __byte_perm_S (w2[3], w3[0], selector); - w5[0] = __byte_perm_S (w2[2], w2[3], selector); - w4[3] = __byte_perm_S (w2[1], w2[2], selector); - w4[2] = __byte_perm_S (w2[0], w2[1], selector); - w4[1] = __byte_perm_S (w1[3], w2[0], selector); - w4[0] = __byte_perm_S (w1[2], w1[3], selector); - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 10: - w7[3] = __byte_perm_S (w5[0], w5[1], selector); - w7[2] = __byte_perm_S (w4[3], w5[0], selector); - w7[1] = __byte_perm_S (w4[2], w4[3], selector); - w7[0] = __byte_perm_S (w4[1], w4[2], selector); - w6[3] = __byte_perm_S (w4[0], w4[1], selector); - w6[2] = __byte_perm_S (w3[3], w4[0], selector); - w6[1] = __byte_perm_S (w3[2], w3[3], selector); - w6[0] = __byte_perm_S (w3[1], w3[2], selector); - w5[3] = __byte_perm_S (w3[0], w3[1], selector); - w5[2] = __byte_perm_S (w2[3], w3[0], selector); - w5[1] = __byte_perm_S (w2[2], w2[3], selector); - w5[0] = __byte_perm_S (w2[1], w2[2], selector); - w4[3] = __byte_perm_S (w2[0], w2[1], selector); - w4[2] = __byte_perm_S (w1[3], w2[0], selector); - w4[1] = __byte_perm_S (w1[2], w1[3], selector); - w4[0] = __byte_perm_S (w1[1], w1[2], selector); - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 11: - w7[3] = __byte_perm_S (w4[3], w5[0], selector); - w7[2] = __byte_perm_S (w4[2], w4[3], selector); - w7[1] = __byte_perm_S (w4[1], w4[2], selector); - w7[0] = __byte_perm_S (w4[0], w4[1], selector); - w6[3] = __byte_perm_S (w3[3], w4[0], selector); - w6[2] = __byte_perm_S (w3[2], w3[3], selector); - w6[1] = __byte_perm_S (w3[1], w3[2], selector); - w6[0] = __byte_perm_S (w3[0], w3[1], selector); - w5[3] = __byte_perm_S (w2[3], w3[0], selector); - w5[2] = __byte_perm_S (w2[2], w2[3], selector); - w5[1] = __byte_perm_S (w2[1], w2[2], selector); - w5[0] = __byte_perm_S (w2[0], w2[1], selector); - w4[3] = __byte_perm_S (w1[3], w2[0], selector); - w4[2] = __byte_perm_S (w1[2], w1[3], selector); - w4[1] = __byte_perm_S (w1[1], w1[2], selector); - w4[0] = __byte_perm_S (w1[0], w1[1], selector); - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 12: - w7[3] = __byte_perm_S (w4[2], w4[3], selector); - w7[2] = __byte_perm_S (w4[1], w4[2], selector); - w7[1] = __byte_perm_S (w4[0], w4[1], selector); - w7[0] = __byte_perm_S (w3[3], w4[0], selector); - w6[3] = __byte_perm_S (w3[2], w3[3], selector); - w6[2] = __byte_perm_S (w3[1], w3[2], selector); - w6[1] = __byte_perm_S (w3[0], w3[1], selector); - w6[0] = __byte_perm_S (w2[3], w3[0], selector); - w5[3] = __byte_perm_S (w2[2], w2[3], selector); - w5[2] = __byte_perm_S (w2[1], w2[2], selector); - w5[1] = __byte_perm_S (w2[0], w2[1], selector); - w5[0] = __byte_perm_S (w1[3], w2[0], selector); - w4[3] = __byte_perm_S (w1[2], w1[3], selector); - w4[2] = __byte_perm_S (w1[1], w1[2], selector); - w4[1] = __byte_perm_S (w1[0], w1[1], selector); - w4[0] = __byte_perm_S (w0[3], w1[0], selector); - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 13: - w7[3] = __byte_perm_S (w4[1], w4[2], selector); - w7[2] = __byte_perm_S (w4[0], w4[1], selector); - w7[1] = __byte_perm_S (w3[3], w4[0], selector); - w7[0] = __byte_perm_S (w3[2], w3[3], selector); - w6[3] = __byte_perm_S (w3[1], w3[2], selector); - w6[2] = __byte_perm_S (w3[0], w3[1], selector); - w6[1] = __byte_perm_S (w2[3], w3[0], selector); - w6[0] = __byte_perm_S (w2[2], w2[3], selector); - w5[3] = __byte_perm_S (w2[1], w2[2], selector); - w5[2] = __byte_perm_S (w2[0], w2[1], selector); - w5[1] = __byte_perm_S (w1[3], w2[0], selector); - w5[0] = __byte_perm_S (w1[2], w1[3], selector); - w4[3] = __byte_perm_S (w1[1], w1[2], selector); - w4[2] = __byte_perm_S (w1[0], w1[1], selector); - w4[1] = __byte_perm_S (w0[3], w1[0], selector); - w4[0] = __byte_perm_S (w0[2], w0[3], selector); - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 14: - w7[3] = __byte_perm_S (w4[0], w4[1], selector); - w7[2] = __byte_perm_S (w3[3], w4[0], selector); - w7[1] = __byte_perm_S (w3[2], w3[3], selector); - w7[0] = __byte_perm_S (w3[1], w3[2], selector); - w6[3] = __byte_perm_S (w3[0], w3[1], selector); - w6[2] = __byte_perm_S (w2[3], w3[0], selector); - w6[1] = __byte_perm_S (w2[2], w2[3], selector); - w6[0] = __byte_perm_S (w2[1], w2[2], selector); - w5[3] = __byte_perm_S (w2[0], w2[1], selector); - w5[2] = __byte_perm_S (w1[3], w2[0], selector); - w5[1] = __byte_perm_S (w1[2], w1[3], selector); - w5[0] = __byte_perm_S (w1[1], w1[2], selector); - w4[3] = __byte_perm_S (w1[0], w1[1], selector); - w4[2] = __byte_perm_S (w0[3], w1[0], selector); - w4[1] = __byte_perm_S (w0[2], w0[3], selector); - w4[0] = __byte_perm_S (w0[1], w0[2], selector); - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 15: - w7[3] = __byte_perm_S (w3[3], w4[0], selector); - w7[2] = __byte_perm_S (w3[2], w3[3], selector); - w7[1] = __byte_perm_S (w3[1], w3[2], selector); - w7[0] = __byte_perm_S (w3[0], w3[1], selector); - w6[3] = __byte_perm_S (w2[3], w3[0], selector); - w6[2] = __byte_perm_S (w2[2], w2[3], selector); - w6[1] = __byte_perm_S (w2[1], w2[2], selector); - w6[0] = __byte_perm_S (w2[0], w2[1], selector); - w5[3] = __byte_perm_S (w1[3], w2[0], selector); - w5[2] = __byte_perm_S (w1[2], w1[3], selector); - w5[1] = __byte_perm_S (w1[1], w1[2], selector); - w5[0] = __byte_perm_S (w1[0], w1[1], selector); - w4[3] = __byte_perm_S (w0[3], w1[0], selector); - w4[2] = __byte_perm_S (w0[2], w0[3], selector); - w4[1] = __byte_perm_S (w0[1], w0[2], selector); - w4[0] = __byte_perm_S (w0[0], w0[1], selector); - w3[3] = __byte_perm_S ( 0, w0[0], selector); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - } - #endif -} - -#define PACKVS84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ - PACKVS4 (s0, v0, e); \ - PACKVS4 (s1, v1, e); \ - PACKVS4 (s2, v2, e); \ - PACKVS4 (s3, v3, e); \ - PACKVS4 (s4, v4, e); \ - PACKVS4 (s5, v5, e); \ - PACKVS4 (s6, v6, e); \ - PACKVS4 (s7, v7, e); - -#define PACKSV84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ - PACKSV4 (s0, v0, e); \ - PACKSV4 (s1, v1, e); \ - PACKSV4 (s2, v2, e); \ - PACKSV4 (s3, v3, e); \ - PACKSV4 (s4, v4, e); \ - PACKSV4 (s5, v5, e); \ - PACKSV4 (s6, v6, e); \ - PACKSV4 (s7, v7, e); - -inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) -{ - #if VECT_SIZE == 1 - - switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, offset); - - #else - - u32 t0[4]; - u32 t1[4]; - u32 t2[4]; - u32 t3[4]; - u32 t4[4]; - u32 t5[4]; - u32 t6[4]; - u32 t7[4]; - - #endif - - #if VECT_SIZE == 2 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - #elif VECT_SIZE == 4 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - // 3 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - - // 4 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - - #elif VECT_SIZE == 8 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - // 3 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - - // 4 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - - // 5 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - - // 6 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - - // 7 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - - // 8 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - - #elif VECT_SIZE == 16 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - // 3 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - - // 4 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - - // 5 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - - // 6 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - - // 7 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - - // 8 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - - // 9 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s8); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); - - // 10 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s9); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); - - // 11 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sa); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); - - // 12 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sb); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); - - // 13 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sc); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); - - // 14 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sd); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); - - // 15 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.se); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); - - // 16 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sf); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); - - #endif -} - __constant u64 k_sha512[80] = { SHA512C00, SHA512C01, SHA512C02, SHA512C03, diff --git a/OpenCL/m15000_a1.cl b/OpenCL/m15000_a1.cl index 86274b263..86f3270eb 100644 --- a/OpenCL/m15000_a1.cl +++ b/OpenCL/m15000_a1.cl @@ -14,1819 +14,6 @@ #include "inc_common.cl" #include "inc_simd.cl" -inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset; - - switch (offset / 4) - { - case 0: - w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 1: - w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 2: - w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 3: - w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 4: - w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 5: - w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 6: - w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 7: - w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 8: - w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 9: - w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 10: - w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 11: - w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 12: - w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 13: - w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 14: - w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 15: - w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - } - #endif - - #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - w7[3] = __byte_perm_S (w7[2], w7[3], selector); - w7[2] = __byte_perm_S (w7[1], w7[2], selector); - w7[1] = __byte_perm_S (w7[0], w7[1], selector); - w7[0] = __byte_perm_S (w6[3], w7[0], selector); - w6[3] = __byte_perm_S (w6[2], w6[3], selector); - w6[2] = __byte_perm_S (w6[1], w6[2], selector); - w6[1] = __byte_perm_S (w6[0], w6[1], selector); - w6[0] = __byte_perm_S (w5[3], w6[0], selector); - w5[3] = __byte_perm_S (w5[2], w5[3], selector); - w5[2] = __byte_perm_S (w5[1], w5[2], selector); - w5[1] = __byte_perm_S (w5[0], w5[1], selector); - w5[0] = __byte_perm_S (w4[3], w5[0], selector); - w4[3] = __byte_perm_S (w4[2], w4[3], selector); - w4[2] = __byte_perm_S (w4[1], w4[2], selector); - w4[1] = __byte_perm_S (w4[0], w4[1], selector); - w4[0] = __byte_perm_S (w3[3], w4[0], selector); - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); - break; - - case 1: - w7[3] = __byte_perm_S (w7[1], w7[2], selector); - w7[2] = __byte_perm_S (w7[0], w7[1], selector); - w7[1] = __byte_perm_S (w6[3], w7[0], selector); - w7[0] = __byte_perm_S (w6[2], w6[3], selector); - w6[3] = __byte_perm_S (w6[1], w6[2], selector); - w6[2] = __byte_perm_S (w6[0], w6[1], selector); - w6[1] = __byte_perm_S (w5[3], w6[0], selector); - w6[0] = __byte_perm_S (w5[2], w5[3], selector); - w5[3] = __byte_perm_S (w5[1], w5[2], selector); - w5[2] = __byte_perm_S (w5[0], w5[1], selector); - w5[1] = __byte_perm_S (w4[3], w5[0], selector); - w5[0] = __byte_perm_S (w4[2], w4[3], selector); - w4[3] = __byte_perm_S (w4[1], w4[2], selector); - w4[2] = __byte_perm_S (w4[0], w4[1], selector); - w4[1] = __byte_perm_S (w3[3], w4[0], selector); - w4[0] = __byte_perm_S (w3[2], w3[3], selector); - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); - w0[0] = 0; - break; - - case 2: - w7[3] = __byte_perm_S (w7[0], w7[1], selector); - w7[2] = __byte_perm_S (w6[3], w7[0], selector); - w7[1] = __byte_perm_S (w6[2], w6[3], selector); - w7[0] = __byte_perm_S (w6[1], w6[2], selector); - w6[3] = __byte_perm_S (w6[0], w6[1], selector); - w6[2] = __byte_perm_S (w5[3], w6[0], selector); - w6[1] = __byte_perm_S (w5[2], w5[3], selector); - w6[0] = __byte_perm_S (w5[1], w5[2], selector); - w5[3] = __byte_perm_S (w5[0], w5[1], selector); - w5[2] = __byte_perm_S (w4[3], w5[0], selector); - w5[1] = __byte_perm_S (w4[2], w4[3], selector); - w5[0] = __byte_perm_S (w4[1], w4[2], selector); - w4[3] = __byte_perm_S (w4[0], w4[1], selector); - w4[2] = __byte_perm_S (w3[3], w4[0], selector); - w4[1] = __byte_perm_S (w3[2], w3[3], selector); - w4[0] = __byte_perm_S (w3[1], w3[2], selector); - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; - break; - - case 3: - w7[3] = __byte_perm_S (w6[3], w7[0], selector); - w7[2] = __byte_perm_S (w6[2], w6[3], selector); - w7[1] = __byte_perm_S (w6[1], w6[2], selector); - w7[0] = __byte_perm_S (w6[0], w6[1], selector); - w6[3] = __byte_perm_S (w5[3], w6[0], selector); - w6[2] = __byte_perm_S (w5[2], w5[3], selector); - w6[1] = __byte_perm_S (w5[1], w5[2], selector); - w6[0] = __byte_perm_S (w5[0], w5[1], selector); - w5[3] = __byte_perm_S (w4[3], w5[0], selector); - w5[2] = __byte_perm_S (w4[2], w4[3], selector); - w5[1] = __byte_perm_S (w4[1], w4[2], selector); - w5[0] = __byte_perm_S (w4[0], w4[1], selector); - w4[3] = __byte_perm_S (w3[3], w4[0], selector); - w4[2] = __byte_perm_S (w3[2], w3[3], selector); - w4[1] = __byte_perm_S (w3[1], w3[2], selector); - w4[0] = __byte_perm_S (w3[0], w3[1], selector); - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 4: - w7[3] = __byte_perm_S (w6[2], w6[3], selector); - w7[2] = __byte_perm_S (w6[1], w6[2], selector); - w7[1] = __byte_perm_S (w6[0], w6[1], selector); - w7[0] = __byte_perm_S (w5[3], w6[0], selector); - w6[3] = __byte_perm_S (w5[2], w5[3], selector); - w6[2] = __byte_perm_S (w5[1], w5[2], selector); - w6[1] = __byte_perm_S (w5[0], w5[1], selector); - w6[0] = __byte_perm_S (w4[3], w5[0], selector); - w5[3] = __byte_perm_S (w4[2], w4[3], selector); - w5[2] = __byte_perm_S (w4[1], w4[2], selector); - w5[1] = __byte_perm_S (w4[0], w4[1], selector); - w5[0] = __byte_perm_S (w3[3], w4[0], selector); - w4[3] = __byte_perm_S (w3[2], w3[3], selector); - w4[2] = __byte_perm_S (w3[1], w3[2], selector); - w4[1] = __byte_perm_S (w3[0], w3[1], selector); - w4[0] = __byte_perm_S (w2[3], w3[0], selector); - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 5: - w7[3] = __byte_perm_S (w6[1], w6[2], selector); - w7[2] = __byte_perm_S (w6[0], w6[1], selector); - w7[1] = __byte_perm_S (w5[3], w6[0], selector); - w7[0] = __byte_perm_S (w5[2], w5[3], selector); - w6[3] = __byte_perm_S (w5[1], w5[2], selector); - w6[2] = __byte_perm_S (w5[0], w5[1], selector); - w6[1] = __byte_perm_S (w4[3], w5[0], selector); - w6[0] = __byte_perm_S (w4[2], w4[3], selector); - w5[3] = __byte_perm_S (w4[1], w4[2], selector); - w5[2] = __byte_perm_S (w4[0], w4[1], selector); - w5[1] = __byte_perm_S (w3[3], w4[0], selector); - w5[0] = __byte_perm_S (w3[2], w3[3], selector); - w4[3] = __byte_perm_S (w3[1], w3[2], selector); - w4[2] = __byte_perm_S (w3[0], w3[1], selector); - w4[1] = __byte_perm_S (w2[3], w3[0], selector); - w4[0] = __byte_perm_S (w2[2], w2[3], selector); - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 6: - w7[3] = __byte_perm_S (w6[0], w6[1], selector); - w7[2] = __byte_perm_S (w5[3], w6[0], selector); - w7[1] = __byte_perm_S (w5[2], w5[3], selector); - w7[0] = __byte_perm_S (w5[1], w5[2], selector); - w6[3] = __byte_perm_S (w5[0], w5[1], selector); - w6[2] = __byte_perm_S (w4[3], w5[0], selector); - w6[1] = __byte_perm_S (w4[2], w4[3], selector); - w6[0] = __byte_perm_S (w4[1], w4[2], selector); - w5[3] = __byte_perm_S (w4[0], w4[1], selector); - w5[2] = __byte_perm_S (w3[3], w4[0], selector); - w5[1] = __byte_perm_S (w3[2], w3[3], selector); - w5[0] = __byte_perm_S (w3[1], w3[2], selector); - w4[3] = __byte_perm_S (w3[0], w3[1], selector); - w4[2] = __byte_perm_S (w2[3], w3[0], selector); - w4[1] = __byte_perm_S (w2[2], w2[3], selector); - w4[0] = __byte_perm_S (w2[1], w2[2], selector); - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 7: - w7[3] = __byte_perm_S (w5[3], w6[0], selector); - w7[2] = __byte_perm_S (w5[2], w5[3], selector); - w7[1] = __byte_perm_S (w5[1], w5[2], selector); - w7[0] = __byte_perm_S (w5[0], w5[1], selector); - w6[3] = __byte_perm_S (w4[3], w5[0], selector); - w6[2] = __byte_perm_S (w4[2], w4[3], selector); - w6[1] = __byte_perm_S (w4[1], w4[2], selector); - w6[0] = __byte_perm_S (w4[0], w4[1], selector); - w5[3] = __byte_perm_S (w3[3], w4[0], selector); - w5[2] = __byte_perm_S (w3[2], w3[3], selector); - w5[1] = __byte_perm_S (w3[1], w3[2], selector); - w5[0] = __byte_perm_S (w3[0], w3[1], selector); - w4[3] = __byte_perm_S (w2[3], w3[0], selector); - w4[2] = __byte_perm_S (w2[2], w2[3], selector); - w4[1] = __byte_perm_S (w2[1], w2[2], selector); - w4[0] = __byte_perm_S (w2[0], w2[1], selector); - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 8: - w7[3] = __byte_perm_S (w5[2], w5[3], selector); - w7[2] = __byte_perm_S (w5[1], w5[2], selector); - w7[1] = __byte_perm_S (w5[0], w5[1], selector); - w7[0] = __byte_perm_S (w4[3], w5[0], selector); - w6[3] = __byte_perm_S (w4[2], w4[3], selector); - w6[2] = __byte_perm_S (w4[1], w4[2], selector); - w6[1] = __byte_perm_S (w4[0], w4[1], selector); - w6[0] = __byte_perm_S (w3[3], w4[0], selector); - w5[3] = __byte_perm_S (w3[2], w3[3], selector); - w5[2] = __byte_perm_S (w3[1], w3[2], selector); - w5[1] = __byte_perm_S (w3[0], w3[1], selector); - w5[0] = __byte_perm_S (w2[3], w3[0], selector); - w4[3] = __byte_perm_S (w2[2], w2[3], selector); - w4[2] = __byte_perm_S (w2[1], w2[2], selector); - w4[1] = __byte_perm_S (w2[0], w2[1], selector); - w4[0] = __byte_perm_S (w1[3], w2[0], selector); - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 9: - w7[3] = __byte_perm_S (w5[1], w5[2], selector); - w7[2] = __byte_perm_S (w5[0], w5[1], selector); - w7[1] = __byte_perm_S (w4[3], w5[0], selector); - w7[0] = __byte_perm_S (w4[2], w4[3], selector); - w6[3] = __byte_perm_S (w4[1], w4[2], selector); - w6[2] = __byte_perm_S (w4[0], w4[1], selector); - w6[1] = __byte_perm_S (w3[3], w4[0], selector); - w6[0] = __byte_perm_S (w3[2], w3[3], selector); - w5[3] = __byte_perm_S (w3[1], w3[2], selector); - w5[2] = __byte_perm_S (w3[0], w3[1], selector); - w5[1] = __byte_perm_S (w2[3], w3[0], selector); - w5[0] = __byte_perm_S (w2[2], w2[3], selector); - w4[3] = __byte_perm_S (w2[1], w2[2], selector); - w4[2] = __byte_perm_S (w2[0], w2[1], selector); - w4[1] = __byte_perm_S (w1[3], w2[0], selector); - w4[0] = __byte_perm_S (w1[2], w1[3], selector); - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 10: - w7[3] = __byte_perm_S (w5[0], w5[1], selector); - w7[2] = __byte_perm_S (w4[3], w5[0], selector); - w7[1] = __byte_perm_S (w4[2], w4[3], selector); - w7[0] = __byte_perm_S (w4[1], w4[2], selector); - w6[3] = __byte_perm_S (w4[0], w4[1], selector); - w6[2] = __byte_perm_S (w3[3], w4[0], selector); - w6[1] = __byte_perm_S (w3[2], w3[3], selector); - w6[0] = __byte_perm_S (w3[1], w3[2], selector); - w5[3] = __byte_perm_S (w3[0], w3[1], selector); - w5[2] = __byte_perm_S (w2[3], w3[0], selector); - w5[1] = __byte_perm_S (w2[2], w2[3], selector); - w5[0] = __byte_perm_S (w2[1], w2[2], selector); - w4[3] = __byte_perm_S (w2[0], w2[1], selector); - w4[2] = __byte_perm_S (w1[3], w2[0], selector); - w4[1] = __byte_perm_S (w1[2], w1[3], selector); - w4[0] = __byte_perm_S (w1[1], w1[2], selector); - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 11: - w7[3] = __byte_perm_S (w4[3], w5[0], selector); - w7[2] = __byte_perm_S (w4[2], w4[3], selector); - w7[1] = __byte_perm_S (w4[1], w4[2], selector); - w7[0] = __byte_perm_S (w4[0], w4[1], selector); - w6[3] = __byte_perm_S (w3[3], w4[0], selector); - w6[2] = __byte_perm_S (w3[2], w3[3], selector); - w6[1] = __byte_perm_S (w3[1], w3[2], selector); - w6[0] = __byte_perm_S (w3[0], w3[1], selector); - w5[3] = __byte_perm_S (w2[3], w3[0], selector); - w5[2] = __byte_perm_S (w2[2], w2[3], selector); - w5[1] = __byte_perm_S (w2[1], w2[2], selector); - w5[0] = __byte_perm_S (w2[0], w2[1], selector); - w4[3] = __byte_perm_S (w1[3], w2[0], selector); - w4[2] = __byte_perm_S (w1[2], w1[3], selector); - w4[1] = __byte_perm_S (w1[1], w1[2], selector); - w4[0] = __byte_perm_S (w1[0], w1[1], selector); - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 12: - w7[3] = __byte_perm_S (w4[2], w4[3], selector); - w7[2] = __byte_perm_S (w4[1], w4[2], selector); - w7[1] = __byte_perm_S (w4[0], w4[1], selector); - w7[0] = __byte_perm_S (w3[3], w4[0], selector); - w6[3] = __byte_perm_S (w3[2], w3[3], selector); - w6[2] = __byte_perm_S (w3[1], w3[2], selector); - w6[1] = __byte_perm_S (w3[0], w3[1], selector); - w6[0] = __byte_perm_S (w2[3], w3[0], selector); - w5[3] = __byte_perm_S (w2[2], w2[3], selector); - w5[2] = __byte_perm_S (w2[1], w2[2], selector); - w5[1] = __byte_perm_S (w2[0], w2[1], selector); - w5[0] = __byte_perm_S (w1[3], w2[0], selector); - w4[3] = __byte_perm_S (w1[2], w1[3], selector); - w4[2] = __byte_perm_S (w1[1], w1[2], selector); - w4[1] = __byte_perm_S (w1[0], w1[1], selector); - w4[0] = __byte_perm_S (w0[3], w1[0], selector); - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 13: - w7[3] = __byte_perm_S (w4[1], w4[2], selector); - w7[2] = __byte_perm_S (w4[0], w4[1], selector); - w7[1] = __byte_perm_S (w3[3], w4[0], selector); - w7[0] = __byte_perm_S (w3[2], w3[3], selector); - w6[3] = __byte_perm_S (w3[1], w3[2], selector); - w6[2] = __byte_perm_S (w3[0], w3[1], selector); - w6[1] = __byte_perm_S (w2[3], w3[0], selector); - w6[0] = __byte_perm_S (w2[2], w2[3], selector); - w5[3] = __byte_perm_S (w2[1], w2[2], selector); - w5[2] = __byte_perm_S (w2[0], w2[1], selector); - w5[1] = __byte_perm_S (w1[3], w2[0], selector); - w5[0] = __byte_perm_S (w1[2], w1[3], selector); - w4[3] = __byte_perm_S (w1[1], w1[2], selector); - w4[2] = __byte_perm_S (w1[0], w1[1], selector); - w4[1] = __byte_perm_S (w0[3], w1[0], selector); - w4[0] = __byte_perm_S (w0[2], w0[3], selector); - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 14: - w7[3] = __byte_perm_S (w4[0], w4[1], selector); - w7[2] = __byte_perm_S (w3[3], w4[0], selector); - w7[1] = __byte_perm_S (w3[2], w3[3], selector); - w7[0] = __byte_perm_S (w3[1], w3[2], selector); - w6[3] = __byte_perm_S (w3[0], w3[1], selector); - w6[2] = __byte_perm_S (w2[3], w3[0], selector); - w6[1] = __byte_perm_S (w2[2], w2[3], selector); - w6[0] = __byte_perm_S (w2[1], w2[2], selector); - w5[3] = __byte_perm_S (w2[0], w2[1], selector); - w5[2] = __byte_perm_S (w1[3], w2[0], selector); - w5[1] = __byte_perm_S (w1[2], w1[3], selector); - w5[0] = __byte_perm_S (w1[1], w1[2], selector); - w4[3] = __byte_perm_S (w1[0], w1[1], selector); - w4[2] = __byte_perm_S (w0[3], w1[0], selector); - w4[1] = __byte_perm_S (w0[2], w0[3], selector); - w4[0] = __byte_perm_S (w0[1], w0[2], selector); - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 15: - w7[3] = __byte_perm_S (w3[3], w4[0], selector); - w7[2] = __byte_perm_S (w3[2], w3[3], selector); - w7[1] = __byte_perm_S (w3[1], w3[2], selector); - w7[0] = __byte_perm_S (w3[0], w3[1], selector); - w6[3] = __byte_perm_S (w2[3], w3[0], selector); - w6[2] = __byte_perm_S (w2[2], w2[3], selector); - w6[1] = __byte_perm_S (w2[1], w2[2], selector); - w6[0] = __byte_perm_S (w2[0], w2[1], selector); - w5[3] = __byte_perm_S (w1[3], w2[0], selector); - w5[2] = __byte_perm_S (w1[2], w1[3], selector); - w5[1] = __byte_perm_S (w1[1], w1[2], selector); - w5[0] = __byte_perm_S (w1[0], w1[1], selector); - w4[3] = __byte_perm_S (w0[3], w1[0], selector); - w4[2] = __byte_perm_S (w0[2], w0[3], selector); - w4[1] = __byte_perm_S (w0[1], w0[2], selector); - w4[0] = __byte_perm_S (w0[0], w0[1], selector); - w3[3] = __byte_perm_S ( 0, w0[0], selector); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - } - #endif -} - -#define PACKVS84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ - PACKVS4 (s0, v0, e); \ - PACKVS4 (s1, v1, e); \ - PACKVS4 (s2, v2, e); \ - PACKVS4 (s3, v3, e); \ - PACKVS4 (s4, v4, e); \ - PACKVS4 (s5, v5, e); \ - PACKVS4 (s6, v6, e); \ - PACKVS4 (s7, v7, e); - -#define PACKSV84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ - PACKSV4 (s0, v0, e); \ - PACKSV4 (s1, v1, e); \ - PACKSV4 (s2, v2, e); \ - PACKSV4 (s3, v3, e); \ - PACKSV4 (s4, v4, e); \ - PACKSV4 (s5, v5, e); \ - PACKSV4 (s6, v6, e); \ - PACKSV4 (s7, v7, e); - -inline void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) -{ - #if VECT_SIZE == 1 - - switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, offset); - - #else - - u32 t0[4]; - u32 t1[4]; - u32 t2[4]; - u32 t3[4]; - u32 t4[4]; - u32 t5[4]; - u32 t6[4]; - u32 t7[4]; - - #endif - - #if VECT_SIZE == 2 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - #elif VECT_SIZE == 4 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - // 3 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - - // 4 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - - #elif VECT_SIZE == 8 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - // 3 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - - // 4 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - - // 5 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - - // 6 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - - // 7 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - - // 8 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - - #elif VECT_SIZE == 16 - - // 1 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); - - // 2 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); - - // 3 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); - - // 4 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); - - // 5 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); - - // 6 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); - - // 7 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); - - // 8 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); - - // 9 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s8); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); - - // 10 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s9); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); - - // 11 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sa); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); - - // 12 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sb); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); - - // 13 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sc); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); - - // 14 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sd); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); - - // 15 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.se); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); - - // 16 - PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); - switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sf); - PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); - - #endif -} - __constant u64 k_sha512[80] = { SHA512C00, SHA512C01, SHA512C02, SHA512C03, diff --git a/OpenCL/m15000_a3.cl b/OpenCL/m15000_a3.cl index a3db83c55..5451e88be 100644 --- a/OpenCL/m15000_a3.cl +++ b/OpenCL/m15000_a3.cl @@ -14,1619 +14,6 @@ #include "inc_common.cl" #include "inc_simd.cl" -inline void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) -{ - #if defined IS_AMD || defined IS_GENERIC - const int offset_mod_4 = offset & 3; - - const int offset_minus_4 = 4 - offset; - - switch (offset / 4) - { - case 0: - w7[3] = amd_bytealign_S (w7[3], w7[2], offset_minus_4); - w7[2] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[1] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[0] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w6[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w4[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 1: - w7[3] = amd_bytealign_S (w7[2], w7[1], offset_minus_4); - w7[2] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[1] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[0] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w6[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w5[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w4[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 2: - w7[3] = amd_bytealign_S (w7[1], w7[0], offset_minus_4); - w7[2] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[1] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[0] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w6[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w5[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w4[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 3: - w7[3] = amd_bytealign_S (w7[0], w6[3], offset_minus_4); - w7[2] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[1] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[0] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w6[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w5[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w4[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 4: - w7[3] = amd_bytealign_S (w6[3], w6[2], offset_minus_4); - w7[2] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[1] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[0] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w5[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w4[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 5: - w7[3] = amd_bytealign_S (w6[2], w6[1], offset_minus_4); - w7[2] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[1] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w6[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w5[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w4[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 6: - w7[3] = amd_bytealign_S (w6[1], w6[0], offset_minus_4); - w7[2] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[0] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w6[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w5[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w4[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 7: - w7[3] = amd_bytealign_S (w6[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[1] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[0] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w6[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w5[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w4[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 8: - w7[3] = amd_bytealign_S (w5[3], w5[2], offset_minus_4); - w7[2] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[1] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[0] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w6[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w5[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w4[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 9: - w7[3] = amd_bytealign_S (w5[2], w5[1], offset_minus_4); - w7[2] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[1] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[0] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w6[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w5[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w4[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 10: - w7[3] = amd_bytealign_S (w5[1], w5[0], offset_minus_4); - w7[2] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[1] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[0] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w6[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w5[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w4[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 11: - w7[3] = amd_bytealign_S (w5[0], w5[3], offset_minus_4); - w7[2] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[1] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[0] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w6[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w5[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w4[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 12: - w7[3] = amd_bytealign_S (w4[3], w4[2], offset_minus_4); - w7[2] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[1] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[0] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w6[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w5[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w4[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 13: - w7[3] = amd_bytealign_S (w4[2], w4[1], offset_minus_4); - w7[2] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[1] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w6[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w5[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w4[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 14: - w7[3] = amd_bytealign_S (w4[1], w4[0], offset_minus_4); - w7[2] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w6[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w5[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w4[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - - case 15: - w7[3] = amd_bytealign_S (w4[0], w3[3], offset_minus_4); - w7[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w7[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w7[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w6[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w6[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w6[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w6[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w5[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w5[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w5[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w5[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w4[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w4[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w4[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w4[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[3] = w4[0]; - w4[0] = w4[1]; - w4[1] = w4[2]; - w4[2] = w4[3]; - w4[3] = w5[0]; - w5[0] = w5[1]; - w5[1] = w5[2]; - w5[2] = w5[3]; - w5[3] = w6[0]; - w6[0] = w6[1]; - w6[1] = w6[2]; - w6[2] = w6[3]; - w6[3] = w7[0]; - w7[0] = w7[1]; - w7[1] = w7[2]; - w7[2] = w7[3]; - w7[3] = 0; - } - - break; - } - #endif - - #ifdef IS_NV - const int offset_minus_4 = 4 - (offset % 4); - - const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; - - switch (offset / 4) - { - case 0: - w7[3] = __byte_perm_S (w7[2], w7[3], selector); - w7[2] = __byte_perm_S (w7[1], w7[2], selector); - w7[1] = __byte_perm_S (w7[0], w7[1], selector); - w7[0] = __byte_perm_S (w6[3], w7[0], selector); - w6[3] = __byte_perm_S (w6[2], w6[3], selector); - w6[2] = __byte_perm_S (w6[1], w6[2], selector); - w6[1] = __byte_perm_S (w6[0], w6[1], selector); - w6[0] = __byte_perm_S (w5[3], w6[0], selector); - w5[3] = __byte_perm_S (w5[2], w5[3], selector); - w5[2] = __byte_perm_S (w5[1], w5[2], selector); - w5[1] = __byte_perm_S (w5[0], w5[1], selector); - w5[0] = __byte_perm_S (w4[3], w5[0], selector); - w4[3] = __byte_perm_S (w4[2], w4[3], selector); - w4[2] = __byte_perm_S (w4[1], w4[2], selector); - w4[1] = __byte_perm_S (w4[0], w4[1], selector); - w4[0] = __byte_perm_S (w3[3], w4[0], selector); - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); - break; - - case 1: - w7[3] = __byte_perm_S (w7[1], w7[2], selector); - w7[2] = __byte_perm_S (w7[0], w7[1], selector); - w7[1] = __byte_perm_S (w6[3], w7[0], selector); - w7[0] = __byte_perm_S (w6[2], w6[3], selector); - w6[3] = __byte_perm_S (w6[1], w6[2], selector); - w6[2] = __byte_perm_S (w6[0], w6[1], selector); - w6[1] = __byte_perm_S (w5[3], w6[0], selector); - w6[0] = __byte_perm_S (w5[2], w5[3], selector); - w5[3] = __byte_perm_S (w5[1], w5[2], selector); - w5[2] = __byte_perm_S (w5[0], w5[1], selector); - w5[1] = __byte_perm_S (w4[3], w5[0], selector); - w5[0] = __byte_perm_S (w4[2], w4[3], selector); - w4[3] = __byte_perm_S (w4[1], w4[2], selector); - w4[2] = __byte_perm_S (w4[0], w4[1], selector); - w4[1] = __byte_perm_S (w3[3], w4[0], selector); - w4[0] = __byte_perm_S (w3[2], w3[3], selector); - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); - w0[0] = 0; - break; - - case 2: - w7[3] = __byte_perm_S (w7[0], w7[1], selector); - w7[2] = __byte_perm_S (w6[3], w7[0], selector); - w7[1] = __byte_perm_S (w6[2], w6[3], selector); - w7[0] = __byte_perm_S (w6[1], w6[2], selector); - w6[3] = __byte_perm_S (w6[0], w6[1], selector); - w6[2] = __byte_perm_S (w5[3], w6[0], selector); - w6[1] = __byte_perm_S (w5[2], w5[3], selector); - w6[0] = __byte_perm_S (w5[1], w5[2], selector); - w5[3] = __byte_perm_S (w5[0], w5[1], selector); - w5[2] = __byte_perm_S (w4[3], w5[0], selector); - w5[1] = __byte_perm_S (w4[2], w4[3], selector); - w5[0] = __byte_perm_S (w4[1], w4[2], selector); - w4[3] = __byte_perm_S (w4[0], w4[1], selector); - w4[2] = __byte_perm_S (w3[3], w4[0], selector); - w4[1] = __byte_perm_S (w3[2], w3[3], selector); - w4[0] = __byte_perm_S (w3[1], w3[2], selector); - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; - break; - - case 3: - w7[3] = __byte_perm_S (w6[3], w7[0], selector); - w7[2] = __byte_perm_S (w6[2], w6[3], selector); - w7[1] = __byte_perm_S (w6[1], w6[2], selector); - w7[0] = __byte_perm_S (w6[0], w6[1], selector); - w6[3] = __byte_perm_S (w5[3], w6[0], selector); - w6[2] = __byte_perm_S (w5[2], w5[3], selector); - w6[1] = __byte_perm_S (w5[1], w5[2], selector); - w6[0] = __byte_perm_S (w5[0], w5[1], selector); - w5[3] = __byte_perm_S (w4[3], w5[0], selector); - w5[2] = __byte_perm_S (w4[2], w4[3], selector); - w5[1] = __byte_perm_S (w4[1], w4[2], selector); - w5[0] = __byte_perm_S (w4[0], w4[1], selector); - w4[3] = __byte_perm_S (w3[3], w4[0], selector); - w4[2] = __byte_perm_S (w3[2], w3[3], selector); - w4[1] = __byte_perm_S (w3[1], w3[2], selector); - w4[0] = __byte_perm_S (w3[0], w3[1], selector); - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 4: - w7[3] = __byte_perm_S (w6[2], w6[3], selector); - w7[2] = __byte_perm_S (w6[1], w6[2], selector); - w7[1] = __byte_perm_S (w6[0], w6[1], selector); - w7[0] = __byte_perm_S (w5[3], w6[0], selector); - w6[3] = __byte_perm_S (w5[2], w5[3], selector); - w6[2] = __byte_perm_S (w5[1], w5[2], selector); - w6[1] = __byte_perm_S (w5[0], w5[1], selector); - w6[0] = __byte_perm_S (w4[3], w5[0], selector); - w5[3] = __byte_perm_S (w4[2], w4[3], selector); - w5[2] = __byte_perm_S (w4[1], w4[2], selector); - w5[1] = __byte_perm_S (w4[0], w4[1], selector); - w5[0] = __byte_perm_S (w3[3], w4[0], selector); - w4[3] = __byte_perm_S (w3[2], w3[3], selector); - w4[2] = __byte_perm_S (w3[1], w3[2], selector); - w4[1] = __byte_perm_S (w3[0], w3[1], selector); - w4[0] = __byte_perm_S (w2[3], w3[0], selector); - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 5: - w7[3] = __byte_perm_S (w6[1], w6[2], selector); - w7[2] = __byte_perm_S (w6[0], w6[1], selector); - w7[1] = __byte_perm_S (w5[3], w6[0], selector); - w7[0] = __byte_perm_S (w5[2], w5[3], selector); - w6[3] = __byte_perm_S (w5[1], w5[2], selector); - w6[2] = __byte_perm_S (w5[0], w5[1], selector); - w6[1] = __byte_perm_S (w4[3], w5[0], selector); - w6[0] = __byte_perm_S (w4[2], w4[3], selector); - w5[3] = __byte_perm_S (w4[1], w4[2], selector); - w5[2] = __byte_perm_S (w4[0], w4[1], selector); - w5[1] = __byte_perm_S (w3[3], w4[0], selector); - w5[0] = __byte_perm_S (w3[2], w3[3], selector); - w4[3] = __byte_perm_S (w3[1], w3[2], selector); - w4[2] = __byte_perm_S (w3[0], w3[1], selector); - w4[1] = __byte_perm_S (w2[3], w3[0], selector); - w4[0] = __byte_perm_S (w2[2], w2[3], selector); - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 6: - w7[3] = __byte_perm_S (w6[0], w6[1], selector); - w7[2] = __byte_perm_S (w5[3], w6[0], selector); - w7[1] = __byte_perm_S (w5[2], w5[3], selector); - w7[0] = __byte_perm_S (w5[1], w5[2], selector); - w6[3] = __byte_perm_S (w5[0], w5[1], selector); - w6[2] = __byte_perm_S (w4[3], w5[0], selector); - w6[1] = __byte_perm_S (w4[2], w4[3], selector); - w6[0] = __byte_perm_S (w4[1], w4[2], selector); - w5[3] = __byte_perm_S (w4[0], w4[1], selector); - w5[2] = __byte_perm_S (w3[3], w4[0], selector); - w5[1] = __byte_perm_S (w3[2], w3[3], selector); - w5[0] = __byte_perm_S (w3[1], w3[2], selector); - w4[3] = __byte_perm_S (w3[0], w3[1], selector); - w4[2] = __byte_perm_S (w2[3], w3[0], selector); - w4[1] = __byte_perm_S (w2[2], w2[3], selector); - w4[0] = __byte_perm_S (w2[1], w2[2], selector); - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 7: - w7[3] = __byte_perm_S (w5[3], w6[0], selector); - w7[2] = __byte_perm_S (w5[2], w5[3], selector); - w7[1] = __byte_perm_S (w5[1], w5[2], selector); - w7[0] = __byte_perm_S (w5[0], w5[1], selector); - w6[3] = __byte_perm_S (w4[3], w5[0], selector); - w6[2] = __byte_perm_S (w4[2], w4[3], selector); - w6[1] = __byte_perm_S (w4[1], w4[2], selector); - w6[0] = __byte_perm_S (w4[0], w4[1], selector); - w5[3] = __byte_perm_S (w3[3], w4[0], selector); - w5[2] = __byte_perm_S (w3[2], w3[3], selector); - w5[1] = __byte_perm_S (w3[1], w3[2], selector); - w5[0] = __byte_perm_S (w3[0], w3[1], selector); - w4[3] = __byte_perm_S (w2[3], w3[0], selector); - w4[2] = __byte_perm_S (w2[2], w2[3], selector); - w4[1] = __byte_perm_S (w2[1], w2[2], selector); - w4[0] = __byte_perm_S (w2[0], w2[1], selector); - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 8: - w7[3] = __byte_perm_S (w5[2], w5[3], selector); - w7[2] = __byte_perm_S (w5[1], w5[2], selector); - w7[1] = __byte_perm_S (w5[0], w5[1], selector); - w7[0] = __byte_perm_S (w4[3], w5[0], selector); - w6[3] = __byte_perm_S (w4[2], w4[3], selector); - w6[2] = __byte_perm_S (w4[1], w4[2], selector); - w6[1] = __byte_perm_S (w4[0], w4[1], selector); - w6[0] = __byte_perm_S (w3[3], w4[0], selector); - w5[3] = __byte_perm_S (w3[2], w3[3], selector); - w5[2] = __byte_perm_S (w3[1], w3[2], selector); - w5[1] = __byte_perm_S (w3[0], w3[1], selector); - w5[0] = __byte_perm_S (w2[3], w3[0], selector); - w4[3] = __byte_perm_S (w2[2], w2[3], selector); - w4[2] = __byte_perm_S (w2[1], w2[2], selector); - w4[1] = __byte_perm_S (w2[0], w2[1], selector); - w4[0] = __byte_perm_S (w1[3], w2[0], selector); - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 9: - w7[3] = __byte_perm_S (w5[1], w5[2], selector); - w7[2] = __byte_perm_S (w5[0], w5[1], selector); - w7[1] = __byte_perm_S (w4[3], w5[0], selector); - w7[0] = __byte_perm_S (w4[2], w4[3], selector); - w6[3] = __byte_perm_S (w4[1], w4[2], selector); - w6[2] = __byte_perm_S (w4[0], w4[1], selector); - w6[1] = __byte_perm_S (w3[3], w4[0], selector); - w6[0] = __byte_perm_S (w3[2], w3[3], selector); - w5[3] = __byte_perm_S (w3[1], w3[2], selector); - w5[2] = __byte_perm_S (w3[0], w3[1], selector); - w5[1] = __byte_perm_S (w2[3], w3[0], selector); - w5[0] = __byte_perm_S (w2[2], w2[3], selector); - w4[3] = __byte_perm_S (w2[1], w2[2], selector); - w4[2] = __byte_perm_S (w2[0], w2[1], selector); - w4[1] = __byte_perm_S (w1[3], w2[0], selector); - w4[0] = __byte_perm_S (w1[2], w1[3], selector); - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 10: - w7[3] = __byte_perm_S (w5[0], w5[1], selector); - w7[2] = __byte_perm_S (w4[3], w5[0], selector); - w7[1] = __byte_perm_S (w4[2], w4[3], selector); - w7[0] = __byte_perm_S (w4[1], w4[2], selector); - w6[3] = __byte_perm_S (w4[0], w4[1], selector); - w6[2] = __byte_perm_S (w3[3], w4[0], selector); - w6[1] = __byte_perm_S (w3[2], w3[3], selector); - w6[0] = __byte_perm_S (w3[1], w3[2], selector); - w5[3] = __byte_perm_S (w3[0], w3[1], selector); - w5[2] = __byte_perm_S (w2[3], w3[0], selector); - w5[1] = __byte_perm_S (w2[2], w2[3], selector); - w5[0] = __byte_perm_S (w2[1], w2[2], selector); - w4[3] = __byte_perm_S (w2[0], w2[1], selector); - w4[2] = __byte_perm_S (w1[3], w2[0], selector); - w4[1] = __byte_perm_S (w1[2], w1[3], selector); - w4[0] = __byte_perm_S (w1[1], w1[2], selector); - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 11: - w7[3] = __byte_perm_S (w4[3], w5[0], selector); - w7[2] = __byte_perm_S (w4[2], w4[3], selector); - w7[1] = __byte_perm_S (w4[1], w4[2], selector); - w7[0] = __byte_perm_S (w4[0], w4[1], selector); - w6[3] = __byte_perm_S (w3[3], w4[0], selector); - w6[2] = __byte_perm_S (w3[2], w3[3], selector); - w6[1] = __byte_perm_S (w3[1], w3[2], selector); - w6[0] = __byte_perm_S (w3[0], w3[1], selector); - w5[3] = __byte_perm_S (w2[3], w3[0], selector); - w5[2] = __byte_perm_S (w2[2], w2[3], selector); - w5[1] = __byte_perm_S (w2[1], w2[2], selector); - w5[0] = __byte_perm_S (w2[0], w2[1], selector); - w4[3] = __byte_perm_S (w1[3], w2[0], selector); - w4[2] = __byte_perm_S (w1[2], w1[3], selector); - w4[1] = __byte_perm_S (w1[1], w1[2], selector); - w4[0] = __byte_perm_S (w1[0], w1[1], selector); - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 12: - w7[3] = __byte_perm_S (w4[2], w4[3], selector); - w7[2] = __byte_perm_S (w4[1], w4[2], selector); - w7[1] = __byte_perm_S (w4[0], w4[1], selector); - w7[0] = __byte_perm_S (w3[3], w4[0], selector); - w6[3] = __byte_perm_S (w3[2], w3[3], selector); - w6[2] = __byte_perm_S (w3[1], w3[2], selector); - w6[1] = __byte_perm_S (w3[0], w3[1], selector); - w6[0] = __byte_perm_S (w2[3], w3[0], selector); - w5[3] = __byte_perm_S (w2[2], w2[3], selector); - w5[2] = __byte_perm_S (w2[1], w2[2], selector); - w5[1] = __byte_perm_S (w2[0], w2[1], selector); - w5[0] = __byte_perm_S (w1[3], w2[0], selector); - w4[3] = __byte_perm_S (w1[2], w1[3], selector); - w4[2] = __byte_perm_S (w1[1], w1[2], selector); - w4[1] = __byte_perm_S (w1[0], w1[1], selector); - w4[0] = __byte_perm_S (w0[3], w1[0], selector); - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 13: - w7[3] = __byte_perm_S (w4[1], w4[2], selector); - w7[2] = __byte_perm_S (w4[0], w4[1], selector); - w7[1] = __byte_perm_S (w3[3], w4[0], selector); - w7[0] = __byte_perm_S (w3[2], w3[3], selector); - w6[3] = __byte_perm_S (w3[1], w3[2], selector); - w6[2] = __byte_perm_S (w3[0], w3[1], selector); - w6[1] = __byte_perm_S (w2[3], w3[0], selector); - w6[0] = __byte_perm_S (w2[2], w2[3], selector); - w5[3] = __byte_perm_S (w2[1], w2[2], selector); - w5[2] = __byte_perm_S (w2[0], w2[1], selector); - w5[1] = __byte_perm_S (w1[3], w2[0], selector); - w5[0] = __byte_perm_S (w1[2], w1[3], selector); - w4[3] = __byte_perm_S (w1[1], w1[2], selector); - w4[2] = __byte_perm_S (w1[0], w1[1], selector); - w4[1] = __byte_perm_S (w0[3], w1[0], selector); - w4[0] = __byte_perm_S (w0[2], w0[3], selector); - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 14: - w7[3] = __byte_perm_S (w4[0], w4[1], selector); - w7[2] = __byte_perm_S (w3[3], w4[0], selector); - w7[1] = __byte_perm_S (w3[2], w3[3], selector); - w7[0] = __byte_perm_S (w3[1], w3[2], selector); - w6[3] = __byte_perm_S (w3[0], w3[1], selector); - w6[2] = __byte_perm_S (w2[3], w3[0], selector); - w6[1] = __byte_perm_S (w2[2], w2[3], selector); - w6[0] = __byte_perm_S (w2[1], w2[2], selector); - w5[3] = __byte_perm_S (w2[0], w2[1], selector); - w5[2] = __byte_perm_S (w1[3], w2[0], selector); - w5[1] = __byte_perm_S (w1[2], w1[3], selector); - w5[0] = __byte_perm_S (w1[1], w1[2], selector); - w4[3] = __byte_perm_S (w1[0], w1[1], selector); - w4[2] = __byte_perm_S (w0[3], w1[0], selector); - w4[1] = __byte_perm_S (w0[2], w0[3], selector); - w4[0] = __byte_perm_S (w0[1], w0[2], selector); - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - - case 15: - w7[3] = __byte_perm_S (w3[3], w4[0], selector); - w7[2] = __byte_perm_S (w3[2], w3[3], selector); - w7[1] = __byte_perm_S (w3[1], w3[2], selector); - w7[0] = __byte_perm_S (w3[0], w3[1], selector); - w6[3] = __byte_perm_S (w2[3], w3[0], selector); - w6[2] = __byte_perm_S (w2[2], w2[3], selector); - w6[1] = __byte_perm_S (w2[1], w2[2], selector); - w6[0] = __byte_perm_S (w2[0], w2[1], selector); - w5[3] = __byte_perm_S (w1[3], w2[0], selector); - w5[2] = __byte_perm_S (w1[2], w1[3], selector); - w5[1] = __byte_perm_S (w1[1], w1[2], selector); - w5[0] = __byte_perm_S (w1[0], w1[1], selector); - w4[3] = __byte_perm_S (w0[3], w1[0], selector); - w4[2] = __byte_perm_S (w0[2], w0[3], selector); - w4[1] = __byte_perm_S (w0[1], w0[2], selector); - w4[0] = __byte_perm_S (w0[0], w0[1], selector); - w3[3] = __byte_perm_S ( 0, w0[0], selector); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - break; - } - #endif -} - __constant u64 k_sha512[80] = { SHA512C00, SHA512C01, SHA512C02, SHA512C03,