diff --git a/OpenCL/amp_a1.cl b/OpenCL/amp_a1.cl index bf191573c..f0b5db0df 100644 --- a/OpenCL/amp_a1.cl +++ b/OpenCL/amp_a1.cl @@ -7,7 +7,7 @@ #include "inc_vendor.cl" #include "inc_types.cl" -inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) +inline void switch_buffer_by_offset_le_S (u32 w[64], const u32 offset) { #if defined IS_AMD || defined IS_GENERIC const int offset_mod_4 = offset & 3; @@ -16,524 +16,4229 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w switch (offset / 4) { - case 0: - w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); - w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - - if (offset_mod_4 == 0) - { - w0[0] = w0[1]; - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 0: + w[63] = amd_bytealign_S (w[63], w[62], offset_minus_4); + w[62] = amd_bytealign_S (w[62], w[61], offset_minus_4); + w[61] = amd_bytealign_S (w[61], w[60], offset_minus_4); + w[60] = amd_bytealign_S (w[60], w[59], offset_minus_4); + w[59] = amd_bytealign_S (w[59], w[58], offset_minus_4); + w[58] = amd_bytealign_S (w[58], w[57], offset_minus_4); + w[57] = amd_bytealign_S (w[57], w[56], offset_minus_4); + w[56] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[55] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[54] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[53] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[52] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[51] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[50] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[49] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[48] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[47] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[46] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[45] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[44] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[43] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[42] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[41] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[40] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[39] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[38] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[37] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[36] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[35] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[34] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[33] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[32] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[31] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[30] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[29] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[28] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[27] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[26] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[25] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[24] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[23] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[22] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[21] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[20] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[19] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[18] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[17] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[16] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[15] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[14] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[13] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[12] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[11] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[10] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[ 6] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[ 5] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[ 4] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[ 3] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 2] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 1] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 0] = amd_bytealign_S (w[ 0], 0, offset_minus_4); break; - - case 1: - w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); - w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[1] = w0[2]; - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 1: + w[63] = amd_bytealign_S (w[62], w[61], offset_minus_4); + w[62] = amd_bytealign_S (w[61], w[60], offset_minus_4); + w[61] = amd_bytealign_S (w[60], w[59], offset_minus_4); + w[60] = amd_bytealign_S (w[59], w[58], offset_minus_4); + w[59] = amd_bytealign_S (w[58], w[57], offset_minus_4); + w[58] = amd_bytealign_S (w[57], w[56], offset_minus_4); + w[57] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[56] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[55] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[54] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[53] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[52] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[51] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[50] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[49] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[48] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[47] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[46] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[45] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[44] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[43] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[42] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[41] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[40] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[39] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[38] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[37] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[36] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[35] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[34] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[33] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[32] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[31] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[30] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[29] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[28] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[27] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[26] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[25] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[24] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[23] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[22] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[21] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[20] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[19] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[18] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[17] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[16] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[15] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[14] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[13] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[12] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[11] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[10] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[ 6] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[ 5] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[ 4] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 3] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 2] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 1] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 0] = 0; break; - - case 2: - w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); - w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[2] = w0[3]; - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 2: + w[63] = amd_bytealign_S (w[61], w[60], offset_minus_4); + w[62] = amd_bytealign_S (w[60], w[59], offset_minus_4); + w[61] = amd_bytealign_S (w[59], w[58], offset_minus_4); + w[60] = amd_bytealign_S (w[58], w[57], offset_minus_4); + w[59] = amd_bytealign_S (w[57], w[56], offset_minus_4); + w[58] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[57] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[56] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[55] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[54] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[53] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[52] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[51] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[50] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[49] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[48] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[47] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[46] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[45] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[44] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[43] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[42] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[41] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[40] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[39] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[38] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[37] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[36] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[35] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[34] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[33] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[32] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[31] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[30] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[29] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[28] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[27] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[26] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[25] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[24] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[23] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[22] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[21] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[20] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[19] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[18] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[17] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[16] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[15] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[14] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[13] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[12] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[11] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[10] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[ 6] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[ 5] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 4] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 3] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 2] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 1] = 0; + w[ 0] = 0; break; - - case 3: - w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); - w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w0[3] = w1[0]; - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 3: + w[63] = amd_bytealign_S (w[60], w[59], offset_minus_4); + w[62] = amd_bytealign_S (w[59], w[58], offset_minus_4); + w[61] = amd_bytealign_S (w[58], w[57], offset_minus_4); + w[60] = amd_bytealign_S (w[57], w[56], offset_minus_4); + w[59] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[58] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[57] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[56] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[55] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[54] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[53] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[52] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[51] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[50] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[49] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[48] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[47] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[46] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[45] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[44] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[43] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[42] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[41] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[40] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[39] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[38] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[37] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[36] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[35] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[34] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[33] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[32] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[31] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[30] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[29] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[28] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[27] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[26] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[25] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[24] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[23] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[22] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[21] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[20] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[19] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[18] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[17] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[16] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[15] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[14] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[13] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[12] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[11] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[10] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[ 6] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 5] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 4] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 3] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 4: - w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); - w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[0] = w1[1]; - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 4: + w[63] = amd_bytealign_S (w[59], w[58], offset_minus_4); + w[62] = amd_bytealign_S (w[58], w[57], offset_minus_4); + w[61] = amd_bytealign_S (w[57], w[56], offset_minus_4); + w[60] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[59] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[58] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[57] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[56] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[55] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[54] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[53] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[52] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[51] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[50] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[49] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[48] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[47] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[46] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[45] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[44] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[43] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[42] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[41] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[40] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[39] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[38] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[37] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[36] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[35] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[34] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[33] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[32] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[31] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[30] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[29] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[28] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[27] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[26] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[25] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[24] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[23] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[22] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[21] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[20] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[19] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[18] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[17] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[16] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[15] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[14] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[13] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[12] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[11] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[10] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 6] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 5] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 4] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 5: - w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); - w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[1] = w1[2]; - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 5: + w[63] = amd_bytealign_S (w[58], w[57], offset_minus_4); + w[62] = amd_bytealign_S (w[57], w[56], offset_minus_4); + w[61] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[60] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[59] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[58] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[57] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[56] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[55] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[54] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[53] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[52] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[51] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[50] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[49] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[48] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[47] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[46] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[45] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[44] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[43] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[42] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[41] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[40] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[39] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[38] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[37] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[36] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[35] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[34] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[33] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[32] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[31] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[30] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[29] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[28] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[27] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[26] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[25] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[24] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[23] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[22] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[21] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[20] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[19] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[18] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[17] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[16] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[15] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[14] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[13] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[12] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[11] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[10] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 6] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 5] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 6: - w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); - w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[2] = w1[3]; - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 6: + w[63] = amd_bytealign_S (w[57], w[56], offset_minus_4); + w[62] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[61] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[60] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[59] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[58] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[57] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[56] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[55] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[54] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[53] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[52] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[51] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[50] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[49] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[48] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[47] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[46] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[45] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[44] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[43] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[42] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[41] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[40] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[39] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[38] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[37] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[36] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[35] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[34] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[33] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[32] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[31] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[30] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[29] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[28] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[27] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[26] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[25] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[24] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[23] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[22] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[21] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[20] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[19] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[18] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[17] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[16] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[15] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[14] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[13] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[12] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[11] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[10] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 6] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 7: - w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); - w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w1[3] = w2[0]; - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 7: + w[63] = amd_bytealign_S (w[56], w[55], offset_minus_4); + w[62] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[61] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[60] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[59] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[58] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[57] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[56] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[55] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[54] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[53] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[52] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[51] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[50] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[49] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[48] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[47] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[46] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[45] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[44] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[43] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[42] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[41] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[40] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[39] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[38] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[37] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[36] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[35] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[34] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[33] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[32] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[31] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[30] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[29] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[28] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[27] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[26] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[25] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[24] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[23] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[22] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[21] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[20] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[19] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[18] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[17] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[16] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[15] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[14] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[13] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[12] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[11] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[10] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 7] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 8: - w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); - w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[0] = w2[1]; - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 8: + w[63] = amd_bytealign_S (w[55], w[54], offset_minus_4); + w[62] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[61] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[60] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[59] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[58] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[57] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[56] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[55] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[54] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[53] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[52] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[51] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[50] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[49] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[48] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[47] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[46] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[45] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[44] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[43] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[42] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[41] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[40] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[39] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[38] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[37] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[36] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[35] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[34] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[33] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[32] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[31] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[30] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[29] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[28] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[27] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[26] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[25] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[24] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[23] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[22] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[21] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[20] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[19] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[18] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[17] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[16] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[15] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[14] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[13] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[12] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[11] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[10] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 8] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 9: - w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); - w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[1] = w2[2]; - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + case 9: + w[63] = amd_bytealign_S (w[54], w[53], offset_minus_4); + w[62] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[61] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[60] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[59] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[58] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[57] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[56] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[55] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[54] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[53] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[52] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[51] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[50] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[49] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[48] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[47] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[46] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[45] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[44] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[43] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[42] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[41] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[40] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[39] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[38] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[37] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[36] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[35] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[34] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[33] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[32] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[31] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[30] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[29] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[28] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[27] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[26] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[25] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[24] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[23] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[22] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[21] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[20] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[19] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[18] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[17] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[16] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[15] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[14] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[13] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[12] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[11] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[10] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[ 9] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 10: - w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); - w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[2] = w2[3]; - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + w[63] = amd_bytealign_S (w[53], w[52], offset_minus_4); + w[62] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[61] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[60] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[59] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[58] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[57] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[56] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[55] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[54] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[53] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[52] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[51] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[50] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[49] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[48] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[47] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[46] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[45] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[44] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[43] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[42] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[41] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[40] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[39] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[38] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[37] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[36] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[35] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[34] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[33] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[32] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[31] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[30] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[29] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[28] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[27] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[26] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[25] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[24] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[23] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[22] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[21] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[20] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[19] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[18] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[17] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[16] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[15] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[14] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[13] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[12] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[11] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[10] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 11: - w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); - w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w2[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w2[3] = w3[0]; - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + w[63] = amd_bytealign_S (w[52], w[51], offset_minus_4); + w[62] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[61] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[60] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[59] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[58] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[57] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[56] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[55] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[54] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[53] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[52] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[51] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[50] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[49] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[48] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[47] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[46] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[45] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[44] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[43] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[42] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[41] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[40] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[39] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[38] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[37] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[36] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[35] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[34] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[33] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[32] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[31] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[30] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[29] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[28] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[27] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[26] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[25] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[24] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[23] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[22] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[21] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[20] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[19] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[18] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[17] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[16] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[15] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[14] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[13] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[12] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[11] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 12: - w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); - w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[0] = w3[1]; - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + w[63] = amd_bytealign_S (w[51], w[50], offset_minus_4); + w[62] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[61] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[60] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[59] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[58] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[57] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[56] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[55] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[54] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[53] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[52] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[51] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[50] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[49] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[48] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[47] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[46] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[45] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[44] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[43] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[42] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[41] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[40] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[39] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[38] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[37] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[36] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[35] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[34] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[33] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[32] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[31] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[30] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[29] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[28] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[27] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[26] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[25] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[24] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[23] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[22] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[21] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[20] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[19] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[18] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[17] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[16] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[15] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[14] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[13] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[12] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 13: - w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); - w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[1] = w3[2]; - w3[2] = w3[3]; - w3[3] = 0; - } - + w[63] = amd_bytealign_S (w[50], w[49], offset_minus_4); + w[62] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[61] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[60] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[59] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[58] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[57] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[56] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[55] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[54] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[53] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[52] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[51] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[50] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[49] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[48] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[47] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[46] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[45] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[44] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[43] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[42] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[41] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[40] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[39] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[38] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[37] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[36] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[35] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[34] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[33] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[32] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[31] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[30] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[29] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[28] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[27] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[26] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[25] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[24] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[23] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[22] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[21] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[20] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[19] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[18] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[17] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[16] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[15] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[14] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[13] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 14: - w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); - w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[2] = w3[3]; - w3[3] = 0; - } - + w[63] = amd_bytealign_S (w[49], w[48], offset_minus_4); + w[62] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[61] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[60] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[59] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[58] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[57] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[56] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[55] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[54] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[53] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[52] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[51] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[50] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[49] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[48] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[47] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[46] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[45] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[44] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[43] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[42] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[41] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[40] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[39] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[38] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[37] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[36] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[35] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[34] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[33] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[32] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[31] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[30] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[29] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[28] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[27] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[26] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[25] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[24] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[23] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[22] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[21] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[20] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[19] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[18] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[17] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[16] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[15] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[14] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 15: - w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; - - if (offset_mod_4 == 0) - { - w3[3] = 0; - } - + w[63] = amd_bytealign_S (w[48], w[47], offset_minus_4); + w[62] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[61] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[60] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[59] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[58] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[57] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[56] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[55] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[54] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[53] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[52] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[51] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[50] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[49] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[48] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[47] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[46] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[45] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[44] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[43] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[42] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[41] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[40] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[39] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[38] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[37] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[36] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[35] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[34] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[33] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[32] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[31] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[30] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[29] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[28] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[27] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[26] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[25] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[24] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[23] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[22] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[21] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[20] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[19] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[18] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[17] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[16] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[15] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 16: + w[63] = amd_bytealign_S (w[47], w[46], offset_minus_4); + w[62] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[61] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[60] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[59] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[58] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[57] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[56] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[55] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[54] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[53] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[52] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[51] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[50] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[49] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[48] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[47] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[46] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[45] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[44] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[43] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[42] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[41] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[40] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[39] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[38] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[37] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[36] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[35] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[34] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[33] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[32] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[31] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[30] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[29] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[28] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[27] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[26] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[25] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[24] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[23] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[22] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[21] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[20] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[19] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[18] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[17] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[16] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 17: + w[63] = amd_bytealign_S (w[46], w[45], offset_minus_4); + w[62] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[61] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[60] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[59] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[58] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[57] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[56] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[55] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[54] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[53] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[52] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[51] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[50] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[49] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[48] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[47] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[46] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[45] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[44] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[43] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[42] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[41] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[40] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[39] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[38] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[37] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[36] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[35] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[34] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[33] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[32] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[31] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[30] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[29] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[28] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[27] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[26] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[25] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[24] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[23] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[22] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[21] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[20] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[19] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[18] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[17] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 18: + w[63] = amd_bytealign_S (w[45], w[44], offset_minus_4); + w[62] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[61] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[60] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[59] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[58] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[57] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[56] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[55] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[54] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[53] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[52] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[51] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[50] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[49] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[48] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[47] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[46] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[45] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[44] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[43] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[42] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[41] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[40] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[39] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[38] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[37] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[36] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[35] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[34] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[33] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[32] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[31] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[30] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[29] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[28] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[27] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[26] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[25] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[24] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[23] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[22] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[21] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[20] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[19] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[18] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 19: + w[63] = amd_bytealign_S (w[44], w[43], offset_minus_4); + w[62] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[61] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[60] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[59] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[58] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[57] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[56] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[55] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[54] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[53] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[52] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[51] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[50] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[49] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[48] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[47] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[46] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[45] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[44] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[43] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[42] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[41] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[40] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[39] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[38] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[37] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[36] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[35] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[34] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[33] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[32] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[31] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[30] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[29] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[28] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[27] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[26] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[25] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[24] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[23] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[22] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[21] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[20] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[19] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 20: + w[63] = amd_bytealign_S (w[43], w[42], offset_minus_4); + w[62] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[61] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[60] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[59] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[58] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[57] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[56] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[55] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[54] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[53] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[52] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[51] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[50] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[49] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[48] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[47] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[46] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[45] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[44] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[43] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[42] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[41] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[40] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[39] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[38] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[37] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[36] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[35] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[34] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[33] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[32] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[31] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[30] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[29] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[28] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[27] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[26] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[25] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[24] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[23] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[22] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[21] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[20] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 21: + w[63] = amd_bytealign_S (w[42], w[41], offset_minus_4); + w[62] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[61] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[60] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[59] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[58] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[57] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[56] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[55] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[54] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[53] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[52] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[51] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[50] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[49] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[48] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[47] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[46] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[45] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[44] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[43] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[42] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[41] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[40] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[39] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[38] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[37] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[36] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[35] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[34] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[33] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[32] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[31] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[30] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[29] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[28] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[27] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[26] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[25] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[24] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[23] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[22] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[21] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 22: + w[63] = amd_bytealign_S (w[41], w[40], offset_minus_4); + w[62] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[61] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[60] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[59] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[58] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[57] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[56] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[55] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[54] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[53] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[52] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[51] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[50] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[49] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[48] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[47] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[46] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[45] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[44] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[43] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[42] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[41] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[40] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[39] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[38] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[37] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[36] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[35] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[34] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[33] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[32] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[31] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[30] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[29] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[28] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[27] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[26] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[25] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[24] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[23] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[22] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 23: + w[63] = amd_bytealign_S (w[40], w[39], offset_minus_4); + w[62] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[61] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[60] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[59] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[58] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[57] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[56] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[55] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[54] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[53] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[52] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[51] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[50] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[49] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[48] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[47] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[46] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[45] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[44] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[43] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[42] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[41] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[40] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[39] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[38] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[37] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[36] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[35] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[34] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[33] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[32] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[31] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[30] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[29] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[28] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[27] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[26] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[25] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[24] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[23] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 24: + w[63] = amd_bytealign_S (w[39], w[38], offset_minus_4); + w[62] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[61] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[60] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[59] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[58] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[57] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[56] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[55] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[54] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[53] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[52] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[51] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[50] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[49] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[48] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[47] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[46] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[45] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[44] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[43] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[42] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[41] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[40] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[39] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[38] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[37] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[36] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[35] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[34] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[33] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[32] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[31] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[30] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[29] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[28] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[27] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[26] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[25] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[24] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 25: + w[63] = amd_bytealign_S (w[38], w[37], offset_minus_4); + w[62] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[61] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[60] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[59] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[58] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[57] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[56] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[55] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[54] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[53] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[52] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[51] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[50] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[49] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[48] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[47] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[46] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[45] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[44] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[43] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[42] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[41] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[40] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[39] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[38] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[37] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[36] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[35] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[34] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[33] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[32] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[31] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[30] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[29] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[28] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[27] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[26] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[25] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 26: + w[63] = amd_bytealign_S (w[37], w[36], offset_minus_4); + w[62] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[61] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[60] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[59] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[58] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[57] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[56] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[55] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[54] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[53] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[52] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[51] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[50] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[49] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[48] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[47] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[46] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[45] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[44] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[43] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[42] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[41] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[40] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[39] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[38] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[37] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[36] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[35] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[34] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[33] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[32] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[31] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[30] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[29] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[28] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[27] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[26] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 27: + w[63] = amd_bytealign_S (w[36], w[35], offset_minus_4); + w[62] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[61] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[60] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[59] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[58] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[57] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[56] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[55] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[54] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[53] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[52] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[51] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[50] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[49] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[48] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[47] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[46] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[45] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[44] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[43] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[42] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[41] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[40] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[39] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[38] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[37] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[36] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[35] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[34] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[33] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[32] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[31] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[30] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[29] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[28] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[27] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 28: + w[63] = amd_bytealign_S (w[35], w[34], offset_minus_4); + w[62] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[61] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[60] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[59] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[58] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[57] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[56] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[55] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[54] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[53] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[52] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[51] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[50] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[49] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[48] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[47] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[46] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[45] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[44] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[43] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[42] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[41] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[40] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[39] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[38] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[37] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[36] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[35] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[34] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[33] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[32] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[31] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[30] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[29] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[28] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 29: + w[63] = amd_bytealign_S (w[34], w[33], offset_minus_4); + w[62] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[61] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[60] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[59] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[58] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[57] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[56] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[55] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[54] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[53] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[52] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[51] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[50] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[49] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[48] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[47] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[46] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[45] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[44] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[43] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[42] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[41] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[40] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[39] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[38] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[37] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[36] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[35] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[34] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[33] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[32] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[31] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[30] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[29] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 30: + w[63] = amd_bytealign_S (w[33], w[32], offset_minus_4); + w[62] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[61] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[60] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[59] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[58] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[57] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[56] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[55] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[54] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[53] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[52] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[51] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[50] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[49] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[48] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[47] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[46] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[45] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[44] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[43] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[42] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[41] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[40] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[39] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[38] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[37] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[36] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[35] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[34] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[33] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[32] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[31] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[30] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 31: + w[63] = amd_bytealign_S (w[32], w[31], offset_minus_4); + w[62] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[61] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[60] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[59] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[58] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[57] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[56] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[55] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[54] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[53] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[52] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[51] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[50] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[49] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[48] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[47] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[46] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[45] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[44] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[43] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[42] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[41] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[40] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[39] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[38] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[37] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[36] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[35] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[34] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[33] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[32] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[31] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 32: + w[63] = amd_bytealign_S (w[31], w[30], offset_minus_4); + w[62] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[61] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[60] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[59] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[58] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[57] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[56] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[55] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[54] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[53] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[52] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[51] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[50] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[49] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[48] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[47] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[46] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[45] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[44] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[43] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[42] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[41] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[40] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[39] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[38] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[37] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[36] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[35] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[34] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[33] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[32] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 33: + w[63] = amd_bytealign_S (w[30], w[29], offset_minus_4); + w[62] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[61] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[60] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[59] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[58] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[57] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[56] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[55] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[54] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[53] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[52] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[51] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[50] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[49] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[48] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[47] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[46] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[45] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[44] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[43] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[42] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[41] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[40] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[39] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[38] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[37] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[36] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[35] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[34] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[33] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 34: + w[63] = amd_bytealign_S (w[29], w[28], offset_minus_4); + w[62] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[61] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[60] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[59] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[58] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[57] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[56] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[55] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[54] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[53] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[52] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[51] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[50] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[49] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[48] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[47] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[46] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[45] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[44] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[43] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[42] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[41] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[40] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[39] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[38] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[37] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[36] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[35] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[34] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 35: + w[63] = amd_bytealign_S (w[28], w[27], offset_minus_4); + w[62] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[61] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[60] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[59] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[58] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[57] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[56] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[55] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[54] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[53] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[52] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[51] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[50] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[49] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[48] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[47] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[46] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[45] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[44] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[43] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[42] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[41] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[40] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[39] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[38] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[37] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[36] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[35] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 36: + w[63] = amd_bytealign_S (w[27], w[26], offset_minus_4); + w[62] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[61] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[60] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[59] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[58] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[57] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[56] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[55] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[54] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[53] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[52] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[51] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[50] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[49] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[48] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[47] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[46] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[45] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[44] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[43] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[42] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[41] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[40] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[39] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[38] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[37] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[36] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 37: + w[63] = amd_bytealign_S (w[26], w[25], offset_minus_4); + w[62] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[61] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[60] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[59] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[58] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[57] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[56] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[55] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[54] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[53] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[52] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[51] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[50] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[49] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[48] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[47] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[46] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[45] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[44] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[43] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[42] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[41] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[40] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[39] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[38] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[37] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 38: + w[63] = amd_bytealign_S (w[25], w[24], offset_minus_4); + w[62] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[61] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[60] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[59] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[58] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[57] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[56] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[55] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[54] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[53] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[52] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[51] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[50] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[49] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[48] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[47] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[46] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[45] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[44] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[43] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[42] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[41] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[40] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[39] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[38] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 39: + w[63] = amd_bytealign_S (w[24], w[23], offset_minus_4); + w[62] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[61] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[60] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[59] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[58] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[57] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[56] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[55] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[54] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[53] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[52] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[51] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[50] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[49] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[48] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[47] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[46] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[45] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[44] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[43] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[42] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[41] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[40] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[39] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 40: + w[63] = amd_bytealign_S (w[23], w[22], offset_minus_4); + w[62] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[61] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[60] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[59] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[58] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[57] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[56] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[55] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[54] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[53] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[52] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[51] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[50] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[49] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[48] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[47] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[46] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[45] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[44] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[43] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[42] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[41] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[40] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 41: + w[63] = amd_bytealign_S (w[22], w[21], offset_minus_4); + w[62] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[61] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[60] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[59] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[58] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[57] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[56] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[55] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[54] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[53] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[52] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[51] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[50] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[49] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[48] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[47] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[46] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[45] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[44] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[43] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[42] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[41] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 42: + w[63] = amd_bytealign_S (w[21], w[20], offset_minus_4); + w[62] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[61] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[60] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[59] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[58] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[57] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[56] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[55] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[54] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[53] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[52] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[51] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[50] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[49] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[48] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[47] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[46] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[45] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[44] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[43] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[42] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 43: + w[63] = amd_bytealign_S (w[20], w[19], offset_minus_4); + w[62] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[61] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[60] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[59] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[58] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[57] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[56] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[55] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[54] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[53] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[52] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[51] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[50] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[49] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[48] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[47] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[46] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[45] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[44] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[43] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 44: + w[63] = amd_bytealign_S (w[19], w[18], offset_minus_4); + w[62] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[61] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[60] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[59] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[58] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[57] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[56] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[55] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[54] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[53] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[52] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[51] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[50] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[49] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[48] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[47] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[46] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[45] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[44] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 45: + w[63] = amd_bytealign_S (w[18], w[17], offset_minus_4); + w[62] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[61] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[60] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[59] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[58] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[57] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[56] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[55] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[54] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[53] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[52] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[51] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[50] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[49] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[48] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[47] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[46] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[45] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 46: + w[63] = amd_bytealign_S (w[17], w[16], offset_minus_4); + w[62] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[61] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[60] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[59] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[58] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[57] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[56] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[55] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[54] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[53] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[52] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[51] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[50] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[49] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[48] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[47] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[46] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 47: + w[63] = amd_bytealign_S (w[16], w[15], offset_minus_4); + w[62] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[61] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[60] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[59] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[58] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[57] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[56] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[55] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[54] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[53] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[52] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[51] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[50] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[49] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[48] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[47] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 48: + w[63] = amd_bytealign_S (w[15], w[14], offset_minus_4); + w[62] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[61] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[60] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[59] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[58] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[57] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[56] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[55] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[54] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[53] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[52] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[51] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[50] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[49] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[48] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 49: + w[63] = amd_bytealign_S (w[14], w[13], offset_minus_4); + w[62] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[61] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[60] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[59] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[58] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[57] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[56] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[55] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[54] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[53] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[52] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[51] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[50] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[49] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 50: + w[63] = amd_bytealign_S (w[13], w[12], offset_minus_4); + w[62] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[61] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[60] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[59] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[58] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[57] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[56] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[55] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[54] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[53] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[52] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[51] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[50] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 51: + w[63] = amd_bytealign_S (w[12], w[11], offset_minus_4); + w[62] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[61] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[60] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[59] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[58] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[57] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[56] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[55] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[54] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[53] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[52] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[51] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 52: + w[63] = amd_bytealign_S (w[11], w[10], offset_minus_4); + w[62] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[61] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[60] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[59] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[58] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[57] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[56] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[55] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[54] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[53] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[52] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 53: + w[63] = amd_bytealign_S (w[10], w[ 9], offset_minus_4); + w[62] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[61] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[60] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[59] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[58] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[57] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[56] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[55] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[54] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[53] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 54: + w[63] = amd_bytealign_S (w[ 9], w[ 8], offset_minus_4); + w[62] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[61] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[60] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[59] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[58] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[57] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[56] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[55] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[54] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 55: + w[63] = amd_bytealign_S (w[ 8], w[ 7], offset_minus_4); + w[62] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[61] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[60] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[59] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[58] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[57] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[56] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[55] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 56: + w[63] = amd_bytealign_S (w[ 7], w[ 6], offset_minus_4); + w[62] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[61] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[60] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[59] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[58] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[57] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[56] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 57: + w[63] = amd_bytealign_S (w[ 6], w[ 5], offset_minus_4); + w[62] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[61] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[60] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[59] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[58] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[57] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 58: + w[63] = amd_bytealign_S (w[ 5], w[ 4], offset_minus_4); + w[62] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[61] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[60] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[59] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[58] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 59: + w[63] = amd_bytealign_S (w[ 4], w[ 3], offset_minus_4); + w[62] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[61] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[60] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[59] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 60: + w[63] = amd_bytealign_S (w[ 3], w[ 2], offset_minus_4); + w[62] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[61] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[60] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 61: + w[63] = amd_bytealign_S (w[ 2], w[ 1], offset_minus_4); + w[62] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[61] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 62: + w[63] = amd_bytealign_S (w[ 1], w[ 0], offset_minus_4); + w[62] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 63: + w[63] = amd_bytealign_S (w[ 0], 0, offset_minus_4); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; } #endif @@ -545,308 +4250,4229 @@ inline void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w switch (offset / 4) { - case 0: - w3[3] = __byte_perm_S (w3[2], w3[3], selector); - w3[2] = __byte_perm_S (w3[1], w3[2], selector); - w3[1] = __byte_perm_S (w3[0], w3[1], selector); - w3[0] = __byte_perm_S (w2[3], w3[0], selector); - w2[3] = __byte_perm_S (w2[2], w2[3], selector); - w2[2] = __byte_perm_S (w2[1], w2[2], selector); - w2[1] = __byte_perm_S (w2[0], w2[1], selector); - w2[0] = __byte_perm_S (w1[3], w2[0], selector); - w1[3] = __byte_perm_S (w1[2], w1[3], selector); - w1[2] = __byte_perm_S (w1[1], w1[2], selector); - w1[1] = __byte_perm_S (w1[0], w1[1], selector); - w1[0] = __byte_perm_S (w0[3], w1[0], selector); - w0[3] = __byte_perm_S (w0[2], w0[3], selector); - w0[2] = __byte_perm_S (w0[1], w0[2], selector); - w0[1] = __byte_perm_S (w0[0], w0[1], selector); - w0[0] = __byte_perm_S ( 0, w0[0], selector); + case 0: + w[63] = __byte_perm_S (w[62], w[63], selector); + w[62] = __byte_perm_S (w[61], w[62], selector); + w[61] = __byte_perm_S (w[60], w[61], selector); + w[60] = __byte_perm_S (w[59], w[60], selector); + w[59] = __byte_perm_S (w[58], w[59], selector); + w[58] = __byte_perm_S (w[57], w[58], selector); + w[57] = __byte_perm_S (w[56], w[57], selector); + w[56] = __byte_perm_S (w[55], w[56], selector); + w[55] = __byte_perm_S (w[54], w[55], selector); + w[54] = __byte_perm_S (w[53], w[54], selector); + w[53] = __byte_perm_S (w[52], w[53], selector); + w[52] = __byte_perm_S (w[51], w[52], selector); + w[51] = __byte_perm_S (w[50], w[51], selector); + w[50] = __byte_perm_S (w[49], w[50], selector); + w[49] = __byte_perm_S (w[48], w[49], selector); + w[48] = __byte_perm_S (w[47], w[48], selector); + w[47] = __byte_perm_S (w[46], w[47], selector); + w[46] = __byte_perm_S (w[45], w[46], selector); + w[45] = __byte_perm_S (w[44], w[45], selector); + w[44] = __byte_perm_S (w[43], w[44], selector); + w[43] = __byte_perm_S (w[42], w[43], selector); + w[42] = __byte_perm_S (w[41], w[42], selector); + w[41] = __byte_perm_S (w[40], w[41], selector); + w[40] = __byte_perm_S (w[39], w[40], selector); + w[39] = __byte_perm_S (w[38], w[39], selector); + w[38] = __byte_perm_S (w[37], w[38], selector); + w[37] = __byte_perm_S (w[36], w[37], selector); + w[36] = __byte_perm_S (w[35], w[36], selector); + w[35] = __byte_perm_S (w[34], w[35], selector); + w[34] = __byte_perm_S (w[33], w[34], selector); + w[33] = __byte_perm_S (w[32], w[33], selector); + w[32] = __byte_perm_S (w[31], w[32], selector); + w[31] = __byte_perm_S (w[30], w[31], selector); + w[30] = __byte_perm_S (w[29], w[30], selector); + w[29] = __byte_perm_S (w[28], w[29], selector); + w[28] = __byte_perm_S (w[27], w[28], selector); + w[27] = __byte_perm_S (w[26], w[27], selector); + w[26] = __byte_perm_S (w[25], w[26], selector); + w[25] = __byte_perm_S (w[24], w[25], selector); + w[24] = __byte_perm_S (w[23], w[24], selector); + w[23] = __byte_perm_S (w[22], w[23], selector); + w[22] = __byte_perm_S (w[21], w[22], selector); + w[21] = __byte_perm_S (w[20], w[21], selector); + w[20] = __byte_perm_S (w[19], w[20], selector); + w[19] = __byte_perm_S (w[18], w[19], selector); + w[18] = __byte_perm_S (w[17], w[18], selector); + w[17] = __byte_perm_S (w[16], w[17], selector); + w[16] = __byte_perm_S (w[15], w[16], selector); + w[15] = __byte_perm_S (w[14], w[15], selector); + w[14] = __byte_perm_S (w[13], w[14], selector); + w[13] = __byte_perm_S (w[12], w[13], selector); + w[12] = __byte_perm_S (w[11], w[12], selector); + w[11] = __byte_perm_S (w[10], w[11], selector); + w[10] = __byte_perm_S (w[ 9], w[10], selector); + w[ 9] = __byte_perm_S (w[ 8], w[ 9], selector); + w[ 8] = __byte_perm_S (w[ 7], w[ 8], selector); + w[ 7] = __byte_perm_S (w[ 6], w[ 7], selector); + w[ 6] = __byte_perm_S (w[ 5], w[ 6], selector); + w[ 5] = __byte_perm_S (w[ 4], w[ 5], selector); + w[ 4] = __byte_perm_S (w[ 3], w[ 4], selector); + w[ 3] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 2] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 1] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 0] = __byte_perm_S ( 0, w[ 0], selector); break; - - case 1: - w3[3] = __byte_perm_S (w3[1], w3[2], selector); - w3[2] = __byte_perm_S (w3[0], w3[1], selector); - w3[1] = __byte_perm_S (w2[3], w3[0], selector); - w3[0] = __byte_perm_S (w2[2], w2[3], selector); - w2[3] = __byte_perm_S (w2[1], w2[2], selector); - w2[2] = __byte_perm_S (w2[0], w2[1], selector); - w2[1] = __byte_perm_S (w1[3], w2[0], selector); - w2[0] = __byte_perm_S (w1[2], w1[3], selector); - w1[3] = __byte_perm_S (w1[1], w1[2], selector); - w1[2] = __byte_perm_S (w1[0], w1[1], selector); - w1[1] = __byte_perm_S (w0[3], w1[0], selector); - w1[0] = __byte_perm_S (w0[2], w0[3], selector); - w0[3] = __byte_perm_S (w0[1], w0[2], selector); - w0[2] = __byte_perm_S (w0[0], w0[1], selector); - w0[1] = __byte_perm_S ( 0, w0[0], selector); - w0[0] = 0; + case 1: + w[63] = __byte_perm_S (w[61], w[62], selector); + w[62] = __byte_perm_S (w[60], w[61], selector); + w[61] = __byte_perm_S (w[59], w[60], selector); + w[60] = __byte_perm_S (w[58], w[59], selector); + w[59] = __byte_perm_S (w[57], w[58], selector); + w[58] = __byte_perm_S (w[56], w[57], selector); + w[57] = __byte_perm_S (w[55], w[56], selector); + w[56] = __byte_perm_S (w[54], w[55], selector); + w[55] = __byte_perm_S (w[53], w[54], selector); + w[54] = __byte_perm_S (w[52], w[53], selector); + w[53] = __byte_perm_S (w[51], w[52], selector); + w[52] = __byte_perm_S (w[50], w[51], selector); + w[51] = __byte_perm_S (w[49], w[50], selector); + w[50] = __byte_perm_S (w[48], w[49], selector); + w[49] = __byte_perm_S (w[47], w[48], selector); + w[48] = __byte_perm_S (w[46], w[47], selector); + w[47] = __byte_perm_S (w[45], w[46], selector); + w[46] = __byte_perm_S (w[44], w[45], selector); + w[45] = __byte_perm_S (w[43], w[44], selector); + w[44] = __byte_perm_S (w[42], w[43], selector); + w[43] = __byte_perm_S (w[41], w[42], selector); + w[42] = __byte_perm_S (w[40], w[41], selector); + w[41] = __byte_perm_S (w[39], w[40], selector); + w[40] = __byte_perm_S (w[38], w[39], selector); + w[39] = __byte_perm_S (w[37], w[38], selector); + w[38] = __byte_perm_S (w[36], w[37], selector); + w[37] = __byte_perm_S (w[35], w[36], selector); + w[36] = __byte_perm_S (w[34], w[35], selector); + w[35] = __byte_perm_S (w[33], w[34], selector); + w[34] = __byte_perm_S (w[32], w[33], selector); + w[33] = __byte_perm_S (w[31], w[32], selector); + w[32] = __byte_perm_S (w[30], w[31], selector); + w[31] = __byte_perm_S (w[29], w[30], selector); + w[30] = __byte_perm_S (w[28], w[29], selector); + w[29] = __byte_perm_S (w[27], w[28], selector); + w[28] = __byte_perm_S (w[26], w[27], selector); + w[27] = __byte_perm_S (w[25], w[26], selector); + w[26] = __byte_perm_S (w[24], w[25], selector); + w[25] = __byte_perm_S (w[23], w[24], selector); + w[24] = __byte_perm_S (w[22], w[23], selector); + w[23] = __byte_perm_S (w[21], w[22], selector); + w[22] = __byte_perm_S (w[20], w[21], selector); + w[21] = __byte_perm_S (w[19], w[20], selector); + w[20] = __byte_perm_S (w[18], w[19], selector); + w[19] = __byte_perm_S (w[17], w[18], selector); + w[18] = __byte_perm_S (w[16], w[17], selector); + w[17] = __byte_perm_S (w[15], w[16], selector); + w[16] = __byte_perm_S (w[14], w[15], selector); + w[15] = __byte_perm_S (w[13], w[14], selector); + w[14] = __byte_perm_S (w[12], w[13], selector); + w[13] = __byte_perm_S (w[11], w[12], selector); + w[12] = __byte_perm_S (w[10], w[11], selector); + w[11] = __byte_perm_S (w[ 9], w[10], selector); + w[10] = __byte_perm_S (w[ 8], w[ 9], selector); + w[ 9] = __byte_perm_S (w[ 7], w[ 8], selector); + w[ 8] = __byte_perm_S (w[ 6], w[ 7], selector); + w[ 7] = __byte_perm_S (w[ 5], w[ 6], selector); + w[ 6] = __byte_perm_S (w[ 4], w[ 5], selector); + w[ 5] = __byte_perm_S (w[ 3], w[ 4], selector); + w[ 4] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 3] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 2] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 1] = __byte_perm_S ( 0, w[ 0], selector); + w[ 0] = 0; break; - - case 2: - w3[3] = __byte_perm_S (w3[0], w3[1], selector); - w3[2] = __byte_perm_S (w2[3], w3[0], selector); - w3[1] = __byte_perm_S (w2[2], w2[3], selector); - w3[0] = __byte_perm_S (w2[1], w2[2], selector); - w2[3] = __byte_perm_S (w2[0], w2[1], selector); - w2[2] = __byte_perm_S (w1[3], w2[0], selector); - w2[1] = __byte_perm_S (w1[2], w1[3], selector); - w2[0] = __byte_perm_S (w1[1], w1[2], selector); - w1[3] = __byte_perm_S (w1[0], w1[1], selector); - w1[2] = __byte_perm_S (w0[3], w1[0], selector); - w1[1] = __byte_perm_S (w0[2], w0[3], selector); - w1[0] = __byte_perm_S (w0[1], w0[2], selector); - w0[3] = __byte_perm_S (w0[0], w0[1], selector); - w0[2] = __byte_perm_S ( 0, w0[0], selector); - w0[1] = 0; - w0[0] = 0; + case 2: + w[63] = __byte_perm_S (w[60], w[61], selector); + w[62] = __byte_perm_S (w[59], w[60], selector); + w[61] = __byte_perm_S (w[58], w[59], selector); + w[60] = __byte_perm_S (w[57], w[58], selector); + w[59] = __byte_perm_S (w[56], w[57], selector); + w[58] = __byte_perm_S (w[55], w[56], selector); + w[57] = __byte_perm_S (w[54], w[55], selector); + w[56] = __byte_perm_S (w[53], w[54], selector); + w[55] = __byte_perm_S (w[52], w[53], selector); + w[54] = __byte_perm_S (w[51], w[52], selector); + w[53] = __byte_perm_S (w[50], w[51], selector); + w[52] = __byte_perm_S (w[49], w[50], selector); + w[51] = __byte_perm_S (w[48], w[49], selector); + w[50] = __byte_perm_S (w[47], w[48], selector); + w[49] = __byte_perm_S (w[46], w[47], selector); + w[48] = __byte_perm_S (w[45], w[46], selector); + w[47] = __byte_perm_S (w[44], w[45], selector); + w[46] = __byte_perm_S (w[43], w[44], selector); + w[45] = __byte_perm_S (w[42], w[43], selector); + w[44] = __byte_perm_S (w[41], w[42], selector); + w[43] = __byte_perm_S (w[40], w[41], selector); + w[42] = __byte_perm_S (w[39], w[40], selector); + w[41] = __byte_perm_S (w[38], w[39], selector); + w[40] = __byte_perm_S (w[37], w[38], selector); + w[39] = __byte_perm_S (w[36], w[37], selector); + w[38] = __byte_perm_S (w[35], w[36], selector); + w[37] = __byte_perm_S (w[34], w[35], selector); + w[36] = __byte_perm_S (w[33], w[34], selector); + w[35] = __byte_perm_S (w[32], w[33], selector); + w[34] = __byte_perm_S (w[31], w[32], selector); + w[33] = __byte_perm_S (w[30], w[31], selector); + w[32] = __byte_perm_S (w[29], w[30], selector); + w[31] = __byte_perm_S (w[28], w[29], selector); + w[30] = __byte_perm_S (w[27], w[28], selector); + w[29] = __byte_perm_S (w[26], w[27], selector); + w[28] = __byte_perm_S (w[25], w[26], selector); + w[27] = __byte_perm_S (w[24], w[25], selector); + w[26] = __byte_perm_S (w[23], w[24], selector); + w[25] = __byte_perm_S (w[22], w[23], selector); + w[24] = __byte_perm_S (w[21], w[22], selector); + w[23] = __byte_perm_S (w[20], w[21], selector); + w[22] = __byte_perm_S (w[19], w[20], selector); + w[21] = __byte_perm_S (w[18], w[19], selector); + w[20] = __byte_perm_S (w[17], w[18], selector); + w[19] = __byte_perm_S (w[16], w[17], selector); + w[18] = __byte_perm_S (w[15], w[16], selector); + w[17] = __byte_perm_S (w[14], w[15], selector); + w[16] = __byte_perm_S (w[13], w[14], selector); + w[15] = __byte_perm_S (w[12], w[13], selector); + w[14] = __byte_perm_S (w[11], w[12], selector); + w[13] = __byte_perm_S (w[10], w[11], selector); + w[12] = __byte_perm_S (w[ 9], w[10], selector); + w[11] = __byte_perm_S (w[ 8], w[ 9], selector); + w[10] = __byte_perm_S (w[ 7], w[ 8], selector); + w[ 9] = __byte_perm_S (w[ 6], w[ 7], selector); + w[ 8] = __byte_perm_S (w[ 5], w[ 6], selector); + w[ 7] = __byte_perm_S (w[ 4], w[ 5], selector); + w[ 6] = __byte_perm_S (w[ 3], w[ 4], selector); + w[ 5] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 4] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 3] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 2] = __byte_perm_S ( 0, w[ 0], selector); + w[ 1] = 0; + w[ 0] = 0; break; - - case 3: - w3[3] = __byte_perm_S (w2[3], w3[0], selector); - w3[2] = __byte_perm_S (w2[2], w2[3], selector); - w3[1] = __byte_perm_S (w2[1], w2[2], selector); - w3[0] = __byte_perm_S (w2[0], w2[1], selector); - w2[3] = __byte_perm_S (w1[3], w2[0], selector); - w2[2] = __byte_perm_S (w1[2], w1[3], selector); - w2[1] = __byte_perm_S (w1[1], w1[2], selector); - w2[0] = __byte_perm_S (w1[0], w1[1], selector); - w1[3] = __byte_perm_S (w0[3], w1[0], selector); - w1[2] = __byte_perm_S (w0[2], w0[3], selector); - w1[1] = __byte_perm_S (w0[1], w0[2], selector); - w1[0] = __byte_perm_S (w0[0], w0[1], selector); - w0[3] = __byte_perm_S ( 0, w0[0], selector); - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 3: + w[63] = __byte_perm_S (w[59], w[60], selector); + w[62] = __byte_perm_S (w[58], w[59], selector); + w[61] = __byte_perm_S (w[57], w[58], selector); + w[60] = __byte_perm_S (w[56], w[57], selector); + w[59] = __byte_perm_S (w[55], w[56], selector); + w[58] = __byte_perm_S (w[54], w[55], selector); + w[57] = __byte_perm_S (w[53], w[54], selector); + w[56] = __byte_perm_S (w[52], w[53], selector); + w[55] = __byte_perm_S (w[51], w[52], selector); + w[54] = __byte_perm_S (w[50], w[51], selector); + w[53] = __byte_perm_S (w[49], w[50], selector); + w[52] = __byte_perm_S (w[48], w[49], selector); + w[51] = __byte_perm_S (w[47], w[48], selector); + w[50] = __byte_perm_S (w[46], w[47], selector); + w[49] = __byte_perm_S (w[45], w[46], selector); + w[48] = __byte_perm_S (w[44], w[45], selector); + w[47] = __byte_perm_S (w[43], w[44], selector); + w[46] = __byte_perm_S (w[42], w[43], selector); + w[45] = __byte_perm_S (w[41], w[42], selector); + w[44] = __byte_perm_S (w[40], w[41], selector); + w[43] = __byte_perm_S (w[39], w[40], selector); + w[42] = __byte_perm_S (w[38], w[39], selector); + w[41] = __byte_perm_S (w[37], w[38], selector); + w[40] = __byte_perm_S (w[36], w[37], selector); + w[39] = __byte_perm_S (w[35], w[36], selector); + w[38] = __byte_perm_S (w[34], w[35], selector); + w[37] = __byte_perm_S (w[33], w[34], selector); + w[36] = __byte_perm_S (w[32], w[33], selector); + w[35] = __byte_perm_S (w[31], w[32], selector); + w[34] = __byte_perm_S (w[30], w[31], selector); + w[33] = __byte_perm_S (w[29], w[30], selector); + w[32] = __byte_perm_S (w[28], w[29], selector); + w[31] = __byte_perm_S (w[27], w[28], selector); + w[30] = __byte_perm_S (w[26], w[27], selector); + w[29] = __byte_perm_S (w[25], w[26], selector); + w[28] = __byte_perm_S (w[24], w[25], selector); + w[27] = __byte_perm_S (w[23], w[24], selector); + w[26] = __byte_perm_S (w[22], w[23], selector); + w[25] = __byte_perm_S (w[21], w[22], selector); + w[24] = __byte_perm_S (w[20], w[21], selector); + w[23] = __byte_perm_S (w[19], w[20], selector); + w[22] = __byte_perm_S (w[18], w[19], selector); + w[21] = __byte_perm_S (w[17], w[18], selector); + w[20] = __byte_perm_S (w[16], w[17], selector); + w[19] = __byte_perm_S (w[15], w[16], selector); + w[18] = __byte_perm_S (w[14], w[15], selector); + w[17] = __byte_perm_S (w[13], w[14], selector); + w[16] = __byte_perm_S (w[12], w[13], selector); + w[15] = __byte_perm_S (w[11], w[12], selector); + w[14] = __byte_perm_S (w[10], w[11], selector); + w[13] = __byte_perm_S (w[ 9], w[10], selector); + w[12] = __byte_perm_S (w[ 8], w[ 9], selector); + w[11] = __byte_perm_S (w[ 7], w[ 8], selector); + w[10] = __byte_perm_S (w[ 6], w[ 7], selector); + w[ 9] = __byte_perm_S (w[ 5], w[ 6], selector); + w[ 8] = __byte_perm_S (w[ 4], w[ 5], selector); + w[ 7] = __byte_perm_S (w[ 3], w[ 4], selector); + w[ 6] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 5] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 4] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 3] = __byte_perm_S ( 0, w[ 0], selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 4: - w3[3] = __byte_perm_S (w2[2], w2[3], selector); - w3[2] = __byte_perm_S (w2[1], w2[2], selector); - w3[1] = __byte_perm_S (w2[0], w2[1], selector); - w3[0] = __byte_perm_S (w1[3], w2[0], selector); - w2[3] = __byte_perm_S (w1[2], w1[3], selector); - w2[2] = __byte_perm_S (w1[1], w1[2], selector); - w2[1] = __byte_perm_S (w1[0], w1[1], selector); - w2[0] = __byte_perm_S (w0[3], w1[0], selector); - w1[3] = __byte_perm_S (w0[2], w0[3], selector); - w1[2] = __byte_perm_S (w0[1], w0[2], selector); - w1[1] = __byte_perm_S (w0[0], w0[1], selector); - w1[0] = __byte_perm_S ( 0, w0[0], selector); - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 4: + w[63] = __byte_perm_S (w[58], w[59], selector); + w[62] = __byte_perm_S (w[57], w[58], selector); + w[61] = __byte_perm_S (w[56], w[57], selector); + w[60] = __byte_perm_S (w[55], w[56], selector); + w[59] = __byte_perm_S (w[54], w[55], selector); + w[58] = __byte_perm_S (w[53], w[54], selector); + w[57] = __byte_perm_S (w[52], w[53], selector); + w[56] = __byte_perm_S (w[51], w[52], selector); + w[55] = __byte_perm_S (w[50], w[51], selector); + w[54] = __byte_perm_S (w[49], w[50], selector); + w[53] = __byte_perm_S (w[48], w[49], selector); + w[52] = __byte_perm_S (w[47], w[48], selector); + w[51] = __byte_perm_S (w[46], w[47], selector); + w[50] = __byte_perm_S (w[45], w[46], selector); + w[49] = __byte_perm_S (w[44], w[45], selector); + w[48] = __byte_perm_S (w[43], w[44], selector); + w[47] = __byte_perm_S (w[42], w[43], selector); + w[46] = __byte_perm_S (w[41], w[42], selector); + w[45] = __byte_perm_S (w[40], w[41], selector); + w[44] = __byte_perm_S (w[39], w[40], selector); + w[43] = __byte_perm_S (w[38], w[39], selector); + w[42] = __byte_perm_S (w[37], w[38], selector); + w[41] = __byte_perm_S (w[36], w[37], selector); + w[40] = __byte_perm_S (w[35], w[36], selector); + w[39] = __byte_perm_S (w[34], w[35], selector); + w[38] = __byte_perm_S (w[33], w[34], selector); + w[37] = __byte_perm_S (w[32], w[33], selector); + w[36] = __byte_perm_S (w[31], w[32], selector); + w[35] = __byte_perm_S (w[30], w[31], selector); + w[34] = __byte_perm_S (w[29], w[30], selector); + w[33] = __byte_perm_S (w[28], w[29], selector); + w[32] = __byte_perm_S (w[27], w[28], selector); + w[31] = __byte_perm_S (w[26], w[27], selector); + w[30] = __byte_perm_S (w[25], w[26], selector); + w[29] = __byte_perm_S (w[24], w[25], selector); + w[28] = __byte_perm_S (w[23], w[24], selector); + w[27] = __byte_perm_S (w[22], w[23], selector); + w[26] = __byte_perm_S (w[21], w[22], selector); + w[25] = __byte_perm_S (w[20], w[21], selector); + w[24] = __byte_perm_S (w[19], w[20], selector); + w[23] = __byte_perm_S (w[18], w[19], selector); + w[22] = __byte_perm_S (w[17], w[18], selector); + w[21] = __byte_perm_S (w[16], w[17], selector); + w[20] = __byte_perm_S (w[15], w[16], selector); + w[19] = __byte_perm_S (w[14], w[15], selector); + w[18] = __byte_perm_S (w[13], w[14], selector); + w[17] = __byte_perm_S (w[12], w[13], selector); + w[16] = __byte_perm_S (w[11], w[12], selector); + w[15] = __byte_perm_S (w[10], w[11], selector); + w[14] = __byte_perm_S (w[ 9], w[10], selector); + w[13] = __byte_perm_S (w[ 8], w[ 9], selector); + w[12] = __byte_perm_S (w[ 7], w[ 8], selector); + w[11] = __byte_perm_S (w[ 6], w[ 7], selector); + w[10] = __byte_perm_S (w[ 5], w[ 6], selector); + w[ 9] = __byte_perm_S (w[ 4], w[ 5], selector); + w[ 8] = __byte_perm_S (w[ 3], w[ 4], selector); + w[ 7] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 6] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 5] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 4] = __byte_perm_S ( 0, w[ 0], selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 5: - w3[3] = __byte_perm_S (w2[1], w2[2], selector); - w3[2] = __byte_perm_S (w2[0], w2[1], selector); - w3[1] = __byte_perm_S (w1[3], w2[0], selector); - w3[0] = __byte_perm_S (w1[2], w1[3], selector); - w2[3] = __byte_perm_S (w1[1], w1[2], selector); - w2[2] = __byte_perm_S (w1[0], w1[1], selector); - w2[1] = __byte_perm_S (w0[3], w1[0], selector); - w2[0] = __byte_perm_S (w0[2], w0[3], selector); - w1[3] = __byte_perm_S (w0[1], w0[2], selector); - w1[2] = __byte_perm_S (w0[0], w0[1], selector); - w1[1] = __byte_perm_S ( 0, w0[0], selector); - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 5: + w[63] = __byte_perm_S (w[57], w[58], selector); + w[62] = __byte_perm_S (w[56], w[57], selector); + w[61] = __byte_perm_S (w[55], w[56], selector); + w[60] = __byte_perm_S (w[54], w[55], selector); + w[59] = __byte_perm_S (w[53], w[54], selector); + w[58] = __byte_perm_S (w[52], w[53], selector); + w[57] = __byte_perm_S (w[51], w[52], selector); + w[56] = __byte_perm_S (w[50], w[51], selector); + w[55] = __byte_perm_S (w[49], w[50], selector); + w[54] = __byte_perm_S (w[48], w[49], selector); + w[53] = __byte_perm_S (w[47], w[48], selector); + w[52] = __byte_perm_S (w[46], w[47], selector); + w[51] = __byte_perm_S (w[45], w[46], selector); + w[50] = __byte_perm_S (w[44], w[45], selector); + w[49] = __byte_perm_S (w[43], w[44], selector); + w[48] = __byte_perm_S (w[42], w[43], selector); + w[47] = __byte_perm_S (w[41], w[42], selector); + w[46] = __byte_perm_S (w[40], w[41], selector); + w[45] = __byte_perm_S (w[39], w[40], selector); + w[44] = __byte_perm_S (w[38], w[39], selector); + w[43] = __byte_perm_S (w[37], w[38], selector); + w[42] = __byte_perm_S (w[36], w[37], selector); + w[41] = __byte_perm_S (w[35], w[36], selector); + w[40] = __byte_perm_S (w[34], w[35], selector); + w[39] = __byte_perm_S (w[33], w[34], selector); + w[38] = __byte_perm_S (w[32], w[33], selector); + w[37] = __byte_perm_S (w[31], w[32], selector); + w[36] = __byte_perm_S (w[30], w[31], selector); + w[35] = __byte_perm_S (w[29], w[30], selector); + w[34] = __byte_perm_S (w[28], w[29], selector); + w[33] = __byte_perm_S (w[27], w[28], selector); + w[32] = __byte_perm_S (w[26], w[27], selector); + w[31] = __byte_perm_S (w[25], w[26], selector); + w[30] = __byte_perm_S (w[24], w[25], selector); + w[29] = __byte_perm_S (w[23], w[24], selector); + w[28] = __byte_perm_S (w[22], w[23], selector); + w[27] = __byte_perm_S (w[21], w[22], selector); + w[26] = __byte_perm_S (w[20], w[21], selector); + w[25] = __byte_perm_S (w[19], w[20], selector); + w[24] = __byte_perm_S (w[18], w[19], selector); + w[23] = __byte_perm_S (w[17], w[18], selector); + w[22] = __byte_perm_S (w[16], w[17], selector); + w[21] = __byte_perm_S (w[15], w[16], selector); + w[20] = __byte_perm_S (w[14], w[15], selector); + w[19] = __byte_perm_S (w[13], w[14], selector); + w[18] = __byte_perm_S (w[12], w[13], selector); + w[17] = __byte_perm_S (w[11], w[12], selector); + w[16] = __byte_perm_S (w[10], w[11], selector); + w[15] = __byte_perm_S (w[ 9], w[10], selector); + w[14] = __byte_perm_S (w[ 8], w[ 9], selector); + w[13] = __byte_perm_S (w[ 7], w[ 8], selector); + w[12] = __byte_perm_S (w[ 6], w[ 7], selector); + w[11] = __byte_perm_S (w[ 5], w[ 6], selector); + w[10] = __byte_perm_S (w[ 4], w[ 5], selector); + w[ 9] = __byte_perm_S (w[ 3], w[ 4], selector); + w[ 8] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 7] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 6] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 5] = __byte_perm_S ( 0, w[ 0], selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 6: - w3[3] = __byte_perm_S (w2[0], w2[1], selector); - w3[2] = __byte_perm_S (w1[3], w2[0], selector); - w3[1] = __byte_perm_S (w1[2], w1[3], selector); - w3[0] = __byte_perm_S (w1[1], w1[2], selector); - w2[3] = __byte_perm_S (w1[0], w1[1], selector); - w2[2] = __byte_perm_S (w0[3], w1[0], selector); - w2[1] = __byte_perm_S (w0[2], w0[3], selector); - w2[0] = __byte_perm_S (w0[1], w0[2], selector); - w1[3] = __byte_perm_S (w0[0], w0[1], selector); - w1[2] = __byte_perm_S ( 0, w0[0], selector); - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 6: + w[63] = __byte_perm_S (w[56], w[57], selector); + w[62] = __byte_perm_S (w[55], w[56], selector); + w[61] = __byte_perm_S (w[54], w[55], selector); + w[60] = __byte_perm_S (w[53], w[54], selector); + w[59] = __byte_perm_S (w[52], w[53], selector); + w[58] = __byte_perm_S (w[51], w[52], selector); + w[57] = __byte_perm_S (w[50], w[51], selector); + w[56] = __byte_perm_S (w[49], w[50], selector); + w[55] = __byte_perm_S (w[48], w[49], selector); + w[54] = __byte_perm_S (w[47], w[48], selector); + w[53] = __byte_perm_S (w[46], w[47], selector); + w[52] = __byte_perm_S (w[45], w[46], selector); + w[51] = __byte_perm_S (w[44], w[45], selector); + w[50] = __byte_perm_S (w[43], w[44], selector); + w[49] = __byte_perm_S (w[42], w[43], selector); + w[48] = __byte_perm_S (w[41], w[42], selector); + w[47] = __byte_perm_S (w[40], w[41], selector); + w[46] = __byte_perm_S (w[39], w[40], selector); + w[45] = __byte_perm_S (w[38], w[39], selector); + w[44] = __byte_perm_S (w[37], w[38], selector); + w[43] = __byte_perm_S (w[36], w[37], selector); + w[42] = __byte_perm_S (w[35], w[36], selector); + w[41] = __byte_perm_S (w[34], w[35], selector); + w[40] = __byte_perm_S (w[33], w[34], selector); + w[39] = __byte_perm_S (w[32], w[33], selector); + w[38] = __byte_perm_S (w[31], w[32], selector); + w[37] = __byte_perm_S (w[30], w[31], selector); + w[36] = __byte_perm_S (w[29], w[30], selector); + w[35] = __byte_perm_S (w[28], w[29], selector); + w[34] = __byte_perm_S (w[27], w[28], selector); + w[33] = __byte_perm_S (w[26], w[27], selector); + w[32] = __byte_perm_S (w[25], w[26], selector); + w[31] = __byte_perm_S (w[24], w[25], selector); + w[30] = __byte_perm_S (w[23], w[24], selector); + w[29] = __byte_perm_S (w[22], w[23], selector); + w[28] = __byte_perm_S (w[21], w[22], selector); + w[27] = __byte_perm_S (w[20], w[21], selector); + w[26] = __byte_perm_S (w[19], w[20], selector); + w[25] = __byte_perm_S (w[18], w[19], selector); + w[24] = __byte_perm_S (w[17], w[18], selector); + w[23] = __byte_perm_S (w[16], w[17], selector); + w[22] = __byte_perm_S (w[15], w[16], selector); + w[21] = __byte_perm_S (w[14], w[15], selector); + w[20] = __byte_perm_S (w[13], w[14], selector); + w[19] = __byte_perm_S (w[12], w[13], selector); + w[18] = __byte_perm_S (w[11], w[12], selector); + w[17] = __byte_perm_S (w[10], w[11], selector); + w[16] = __byte_perm_S (w[ 9], w[10], selector); + w[15] = __byte_perm_S (w[ 8], w[ 9], selector); + w[14] = __byte_perm_S (w[ 7], w[ 8], selector); + w[13] = __byte_perm_S (w[ 6], w[ 7], selector); + w[12] = __byte_perm_S (w[ 5], w[ 6], selector); + w[11] = __byte_perm_S (w[ 4], w[ 5], selector); + w[10] = __byte_perm_S (w[ 3], w[ 4], selector); + w[ 9] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 8] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 7] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 6] = __byte_perm_S ( 0, w[ 0], selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 7: - w3[3] = __byte_perm_S (w1[3], w2[0], selector); - w3[2] = __byte_perm_S (w1[2], w1[3], selector); - w3[1] = __byte_perm_S (w1[1], w1[2], selector); - w3[0] = __byte_perm_S (w1[0], w1[1], selector); - w2[3] = __byte_perm_S (w0[3], w1[0], selector); - w2[2] = __byte_perm_S (w0[2], w0[3], selector); - w2[1] = __byte_perm_S (w0[1], w0[2], selector); - w2[0] = __byte_perm_S (w0[0], w0[1], selector); - w1[3] = __byte_perm_S ( 0, w0[0], selector); - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 7: + w[63] = __byte_perm_S (w[55], w[56], selector); + w[62] = __byte_perm_S (w[54], w[55], selector); + w[61] = __byte_perm_S (w[53], w[54], selector); + w[60] = __byte_perm_S (w[52], w[53], selector); + w[59] = __byte_perm_S (w[51], w[52], selector); + w[58] = __byte_perm_S (w[50], w[51], selector); + w[57] = __byte_perm_S (w[49], w[50], selector); + w[56] = __byte_perm_S (w[48], w[49], selector); + w[55] = __byte_perm_S (w[47], w[48], selector); + w[54] = __byte_perm_S (w[46], w[47], selector); + w[53] = __byte_perm_S (w[45], w[46], selector); + w[52] = __byte_perm_S (w[44], w[45], selector); + w[51] = __byte_perm_S (w[43], w[44], selector); + w[50] = __byte_perm_S (w[42], w[43], selector); + w[49] = __byte_perm_S (w[41], w[42], selector); + w[48] = __byte_perm_S (w[40], w[41], selector); + w[47] = __byte_perm_S (w[39], w[40], selector); + w[46] = __byte_perm_S (w[38], w[39], selector); + w[45] = __byte_perm_S (w[37], w[38], selector); + w[44] = __byte_perm_S (w[36], w[37], selector); + w[43] = __byte_perm_S (w[35], w[36], selector); + w[42] = __byte_perm_S (w[34], w[35], selector); + w[41] = __byte_perm_S (w[33], w[34], selector); + w[40] = __byte_perm_S (w[32], w[33], selector); + w[39] = __byte_perm_S (w[31], w[32], selector); + w[38] = __byte_perm_S (w[30], w[31], selector); + w[37] = __byte_perm_S (w[29], w[30], selector); + w[36] = __byte_perm_S (w[28], w[29], selector); + w[35] = __byte_perm_S (w[27], w[28], selector); + w[34] = __byte_perm_S (w[26], w[27], selector); + w[33] = __byte_perm_S (w[25], w[26], selector); + w[32] = __byte_perm_S (w[24], w[25], selector); + w[31] = __byte_perm_S (w[23], w[24], selector); + w[30] = __byte_perm_S (w[22], w[23], selector); + w[29] = __byte_perm_S (w[21], w[22], selector); + w[28] = __byte_perm_S (w[20], w[21], selector); + w[27] = __byte_perm_S (w[19], w[20], selector); + w[26] = __byte_perm_S (w[18], w[19], selector); + w[25] = __byte_perm_S (w[17], w[18], selector); + w[24] = __byte_perm_S (w[16], w[17], selector); + w[23] = __byte_perm_S (w[15], w[16], selector); + w[22] = __byte_perm_S (w[14], w[15], selector); + w[21] = __byte_perm_S (w[13], w[14], selector); + w[20] = __byte_perm_S (w[12], w[13], selector); + w[19] = __byte_perm_S (w[11], w[12], selector); + w[18] = __byte_perm_S (w[10], w[11], selector); + w[17] = __byte_perm_S (w[ 9], w[10], selector); + w[16] = __byte_perm_S (w[ 8], w[ 9], selector); + w[15] = __byte_perm_S (w[ 7], w[ 8], selector); + w[14] = __byte_perm_S (w[ 6], w[ 7], selector); + w[13] = __byte_perm_S (w[ 5], w[ 6], selector); + w[12] = __byte_perm_S (w[ 4], w[ 5], selector); + w[11] = __byte_perm_S (w[ 3], w[ 4], selector); + w[10] = __byte_perm_S (w[ 2], w[ 3], selector); + w[ 9] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 8] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 7] = __byte_perm_S ( 0, w[ 0], selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 8: - w3[3] = __byte_perm_S (w1[2], w1[3], selector); - w3[2] = __byte_perm_S (w1[1], w1[2], selector); - w3[1] = __byte_perm_S (w1[0], w1[1], selector); - w3[0] = __byte_perm_S (w0[3], w1[0], selector); - w2[3] = __byte_perm_S (w0[2], w0[3], selector); - w2[2] = __byte_perm_S (w0[1], w0[2], selector); - w2[1] = __byte_perm_S (w0[0], w0[1], selector); - w2[0] = __byte_perm_S ( 0, w0[0], selector); - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 8: + w[63] = __byte_perm_S (w[54], w[55], selector); + w[62] = __byte_perm_S (w[53], w[54], selector); + w[61] = __byte_perm_S (w[52], w[53], selector); + w[60] = __byte_perm_S (w[51], w[52], selector); + w[59] = __byte_perm_S (w[50], w[51], selector); + w[58] = __byte_perm_S (w[49], w[50], selector); + w[57] = __byte_perm_S (w[48], w[49], selector); + w[56] = __byte_perm_S (w[47], w[48], selector); + w[55] = __byte_perm_S (w[46], w[47], selector); + w[54] = __byte_perm_S (w[45], w[46], selector); + w[53] = __byte_perm_S (w[44], w[45], selector); + w[52] = __byte_perm_S (w[43], w[44], selector); + w[51] = __byte_perm_S (w[42], w[43], selector); + w[50] = __byte_perm_S (w[41], w[42], selector); + w[49] = __byte_perm_S (w[40], w[41], selector); + w[48] = __byte_perm_S (w[39], w[40], selector); + w[47] = __byte_perm_S (w[38], w[39], selector); + w[46] = __byte_perm_S (w[37], w[38], selector); + w[45] = __byte_perm_S (w[36], w[37], selector); + w[44] = __byte_perm_S (w[35], w[36], selector); + w[43] = __byte_perm_S (w[34], w[35], selector); + w[42] = __byte_perm_S (w[33], w[34], selector); + w[41] = __byte_perm_S (w[32], w[33], selector); + w[40] = __byte_perm_S (w[31], w[32], selector); + w[39] = __byte_perm_S (w[30], w[31], selector); + w[38] = __byte_perm_S (w[29], w[30], selector); + w[37] = __byte_perm_S (w[28], w[29], selector); + w[36] = __byte_perm_S (w[27], w[28], selector); + w[35] = __byte_perm_S (w[26], w[27], selector); + w[34] = __byte_perm_S (w[25], w[26], selector); + w[33] = __byte_perm_S (w[24], w[25], selector); + w[32] = __byte_perm_S (w[23], w[24], selector); + w[31] = __byte_perm_S (w[22], w[23], selector); + w[30] = __byte_perm_S (w[21], w[22], selector); + w[29] = __byte_perm_S (w[20], w[21], selector); + w[28] = __byte_perm_S (w[19], w[20], selector); + w[27] = __byte_perm_S (w[18], w[19], selector); + w[26] = __byte_perm_S (w[17], w[18], selector); + w[25] = __byte_perm_S (w[16], w[17], selector); + w[24] = __byte_perm_S (w[15], w[16], selector); + w[23] = __byte_perm_S (w[14], w[15], selector); + w[22] = __byte_perm_S (w[13], w[14], selector); + w[21] = __byte_perm_S (w[12], w[13], selector); + w[20] = __byte_perm_S (w[11], w[12], selector); + w[19] = __byte_perm_S (w[10], w[11], selector); + w[18] = __byte_perm_S (w[ 9], w[10], selector); + w[17] = __byte_perm_S (w[ 8], w[ 9], selector); + w[16] = __byte_perm_S (w[ 7], w[ 8], selector); + w[15] = __byte_perm_S (w[ 6], w[ 7], selector); + w[14] = __byte_perm_S (w[ 5], w[ 6], selector); + w[13] = __byte_perm_S (w[ 4], w[ 5], selector); + w[12] = __byte_perm_S (w[ 3], w[ 4], selector); + w[11] = __byte_perm_S (w[ 2], w[ 3], selector); + w[10] = __byte_perm_S (w[ 1], w[ 2], selector); + w[ 9] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 8] = __byte_perm_S ( 0, w[ 0], selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - - case 9: - w3[3] = __byte_perm_S (w1[1], w1[2], selector); - w3[2] = __byte_perm_S (w1[0], w1[1], selector); - w3[1] = __byte_perm_S (w0[3], w1[0], selector); - w3[0] = __byte_perm_S (w0[2], w0[3], selector); - w2[3] = __byte_perm_S (w0[1], w0[2], selector); - w2[2] = __byte_perm_S (w0[0], w0[1], selector); - w2[1] = __byte_perm_S ( 0, w0[0], selector); - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + case 9: + w[63] = __byte_perm_S (w[53], w[54], selector); + w[62] = __byte_perm_S (w[52], w[53], selector); + w[61] = __byte_perm_S (w[51], w[52], selector); + w[60] = __byte_perm_S (w[50], w[51], selector); + w[59] = __byte_perm_S (w[49], w[50], selector); + w[58] = __byte_perm_S (w[48], w[49], selector); + w[57] = __byte_perm_S (w[47], w[48], selector); + w[56] = __byte_perm_S (w[46], w[47], selector); + w[55] = __byte_perm_S (w[45], w[46], selector); + w[54] = __byte_perm_S (w[44], w[45], selector); + w[53] = __byte_perm_S (w[43], w[44], selector); + w[52] = __byte_perm_S (w[42], w[43], selector); + w[51] = __byte_perm_S (w[41], w[42], selector); + w[50] = __byte_perm_S (w[40], w[41], selector); + w[49] = __byte_perm_S (w[39], w[40], selector); + w[48] = __byte_perm_S (w[38], w[39], selector); + w[47] = __byte_perm_S (w[37], w[38], selector); + w[46] = __byte_perm_S (w[36], w[37], selector); + w[45] = __byte_perm_S (w[35], w[36], selector); + w[44] = __byte_perm_S (w[34], w[35], selector); + w[43] = __byte_perm_S (w[33], w[34], selector); + w[42] = __byte_perm_S (w[32], w[33], selector); + w[41] = __byte_perm_S (w[31], w[32], selector); + w[40] = __byte_perm_S (w[30], w[31], selector); + w[39] = __byte_perm_S (w[29], w[30], selector); + w[38] = __byte_perm_S (w[28], w[29], selector); + w[37] = __byte_perm_S (w[27], w[28], selector); + w[36] = __byte_perm_S (w[26], w[27], selector); + w[35] = __byte_perm_S (w[25], w[26], selector); + w[34] = __byte_perm_S (w[24], w[25], selector); + w[33] = __byte_perm_S (w[23], w[24], selector); + w[32] = __byte_perm_S (w[22], w[23], selector); + w[31] = __byte_perm_S (w[21], w[22], selector); + w[30] = __byte_perm_S (w[20], w[21], selector); + w[29] = __byte_perm_S (w[19], w[20], selector); + w[28] = __byte_perm_S (w[18], w[19], selector); + w[27] = __byte_perm_S (w[17], w[18], selector); + w[26] = __byte_perm_S (w[16], w[17], selector); + w[25] = __byte_perm_S (w[15], w[16], selector); + w[24] = __byte_perm_S (w[14], w[15], selector); + w[23] = __byte_perm_S (w[13], w[14], selector); + w[22] = __byte_perm_S (w[12], w[13], selector); + w[21] = __byte_perm_S (w[11], w[12], selector); + w[20] = __byte_perm_S (w[10], w[11], selector); + w[19] = __byte_perm_S (w[ 9], w[10], selector); + w[18] = __byte_perm_S (w[ 8], w[ 9], selector); + w[17] = __byte_perm_S (w[ 7], w[ 8], selector); + w[16] = __byte_perm_S (w[ 6], w[ 7], selector); + w[15] = __byte_perm_S (w[ 5], w[ 6], selector); + w[14] = __byte_perm_S (w[ 4], w[ 5], selector); + w[13] = __byte_perm_S (w[ 3], w[ 4], selector); + w[12] = __byte_perm_S (w[ 2], w[ 3], selector); + w[11] = __byte_perm_S (w[ 1], w[ 2], selector); + w[10] = __byte_perm_S (w[ 0], w[ 1], selector); + w[ 9] = __byte_perm_S ( 0, w[ 0], selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 10: - w3[3] = __byte_perm_S (w1[0], w1[1], selector); - w3[2] = __byte_perm_S (w0[3], w1[0], selector); - w3[1] = __byte_perm_S (w0[2], w0[3], selector); - w3[0] = __byte_perm_S (w0[1], w0[2], selector); - w2[3] = __byte_perm_S (w0[0], w0[1], selector); - w2[2] = __byte_perm_S ( 0, w0[0], selector); - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + w[63] = __byte_perm_S (w[52], w[53], selector); + w[62] = __byte_perm_S (w[51], w[52], selector); + w[61] = __byte_perm_S (w[50], w[51], selector); + w[60] = __byte_perm_S (w[49], w[50], selector); + w[59] = __byte_perm_S (w[48], w[49], selector); + w[58] = __byte_perm_S (w[47], w[48], selector); + w[57] = __byte_perm_S (w[46], w[47], selector); + w[56] = __byte_perm_S (w[45], w[46], selector); + w[55] = __byte_perm_S (w[44], w[45], selector); + w[54] = __byte_perm_S (w[43], w[44], selector); + w[53] = __byte_perm_S (w[42], w[43], selector); + w[52] = __byte_perm_S (w[41], w[42], selector); + w[51] = __byte_perm_S (w[40], w[41], selector); + w[50] = __byte_perm_S (w[39], w[40], selector); + w[49] = __byte_perm_S (w[38], w[39], selector); + w[48] = __byte_perm_S (w[37], w[38], selector); + w[47] = __byte_perm_S (w[36], w[37], selector); + w[46] = __byte_perm_S (w[35], w[36], selector); + w[45] = __byte_perm_S (w[34], w[35], selector); + w[44] = __byte_perm_S (w[33], w[34], selector); + w[43] = __byte_perm_S (w[32], w[33], selector); + w[42] = __byte_perm_S (w[31], w[32], selector); + w[41] = __byte_perm_S (w[30], w[31], selector); + w[40] = __byte_perm_S (w[29], w[30], selector); + w[39] = __byte_perm_S (w[28], w[29], selector); + w[38] = __byte_perm_S (w[27], w[28], selector); + w[37] = __byte_perm_S (w[26], w[27], selector); + w[36] = __byte_perm_S (w[25], w[26], selector); + w[35] = __byte_perm_S (w[24], w[25], selector); + w[34] = __byte_perm_S (w[23], w[24], selector); + w[33] = __byte_perm_S (w[22], w[23], selector); + w[32] = __byte_perm_S (w[21], w[22], selector); + w[31] = __byte_perm_S (w[20], w[21], selector); + w[30] = __byte_perm_S (w[19], w[20], selector); + w[29] = __byte_perm_S (w[18], w[19], selector); + w[28] = __byte_perm_S (w[17], w[18], selector); + w[27] = __byte_perm_S (w[16], w[17], selector); + w[26] = __byte_perm_S (w[15], w[16], selector); + w[25] = __byte_perm_S (w[14], w[15], selector); + w[24] = __byte_perm_S (w[13], w[14], selector); + w[23] = __byte_perm_S (w[12], w[13], selector); + w[22] = __byte_perm_S (w[11], w[12], selector); + w[21] = __byte_perm_S (w[10], w[11], selector); + w[20] = __byte_perm_S (w[ 9], w[10], selector); + w[19] = __byte_perm_S (w[ 8], w[ 9], selector); + w[18] = __byte_perm_S (w[ 7], w[ 8], selector); + w[17] = __byte_perm_S (w[ 6], w[ 7], selector); + w[16] = __byte_perm_S (w[ 5], w[ 6], selector); + w[15] = __byte_perm_S (w[ 4], w[ 5], selector); + w[14] = __byte_perm_S (w[ 3], w[ 4], selector); + w[13] = __byte_perm_S (w[ 2], w[ 3], selector); + w[12] = __byte_perm_S (w[ 1], w[ 2], selector); + w[11] = __byte_perm_S (w[ 0], w[ 1], selector); + w[10] = __byte_perm_S ( 0, w[ 0], selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 11: - w3[3] = __byte_perm_S (w0[3], w1[0], selector); - w3[2] = __byte_perm_S (w0[2], w0[3], selector); - w3[1] = __byte_perm_S (w0[1], w0[2], selector); - w3[0] = __byte_perm_S (w0[0], w0[1], selector); - w2[3] = __byte_perm_S ( 0, w0[0], selector); - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + w[63] = __byte_perm_S (w[51], w[52], selector); + w[62] = __byte_perm_S (w[50], w[51], selector); + w[61] = __byte_perm_S (w[49], w[50], selector); + w[60] = __byte_perm_S (w[48], w[49], selector); + w[59] = __byte_perm_S (w[47], w[48], selector); + w[58] = __byte_perm_S (w[46], w[47], selector); + w[57] = __byte_perm_S (w[45], w[46], selector); + w[56] = __byte_perm_S (w[44], w[45], selector); + w[55] = __byte_perm_S (w[43], w[44], selector); + w[54] = __byte_perm_S (w[42], w[43], selector); + w[53] = __byte_perm_S (w[41], w[42], selector); + w[52] = __byte_perm_S (w[40], w[41], selector); + w[51] = __byte_perm_S (w[39], w[40], selector); + w[50] = __byte_perm_S (w[38], w[39], selector); + w[49] = __byte_perm_S (w[37], w[38], selector); + w[48] = __byte_perm_S (w[36], w[37], selector); + w[47] = __byte_perm_S (w[35], w[36], selector); + w[46] = __byte_perm_S (w[34], w[35], selector); + w[45] = __byte_perm_S (w[33], w[34], selector); + w[44] = __byte_perm_S (w[32], w[33], selector); + w[43] = __byte_perm_S (w[31], w[32], selector); + w[42] = __byte_perm_S (w[30], w[31], selector); + w[41] = __byte_perm_S (w[29], w[30], selector); + w[40] = __byte_perm_S (w[28], w[29], selector); + w[39] = __byte_perm_S (w[27], w[28], selector); + w[38] = __byte_perm_S (w[26], w[27], selector); + w[37] = __byte_perm_S (w[25], w[26], selector); + w[36] = __byte_perm_S (w[24], w[25], selector); + w[35] = __byte_perm_S (w[23], w[24], selector); + w[34] = __byte_perm_S (w[22], w[23], selector); + w[33] = __byte_perm_S (w[21], w[22], selector); + w[32] = __byte_perm_S (w[20], w[21], selector); + w[31] = __byte_perm_S (w[19], w[20], selector); + w[30] = __byte_perm_S (w[18], w[19], selector); + w[29] = __byte_perm_S (w[17], w[18], selector); + w[28] = __byte_perm_S (w[16], w[17], selector); + w[27] = __byte_perm_S (w[15], w[16], selector); + w[26] = __byte_perm_S (w[14], w[15], selector); + w[25] = __byte_perm_S (w[13], w[14], selector); + w[24] = __byte_perm_S (w[12], w[13], selector); + w[23] = __byte_perm_S (w[11], w[12], selector); + w[22] = __byte_perm_S (w[10], w[11], selector); + w[21] = __byte_perm_S (w[ 9], w[10], selector); + w[20] = __byte_perm_S (w[ 8], w[ 9], selector); + w[19] = __byte_perm_S (w[ 7], w[ 8], selector); + w[18] = __byte_perm_S (w[ 6], w[ 7], selector); + w[17] = __byte_perm_S (w[ 5], w[ 6], selector); + w[16] = __byte_perm_S (w[ 4], w[ 5], selector); + w[15] = __byte_perm_S (w[ 3], w[ 4], selector); + w[14] = __byte_perm_S (w[ 2], w[ 3], selector); + w[13] = __byte_perm_S (w[ 1], w[ 2], selector); + w[12] = __byte_perm_S (w[ 0], w[ 1], selector); + w[11] = __byte_perm_S ( 0, w[ 0], selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 12: - w3[3] = __byte_perm_S (w0[2], w0[3], selector); - w3[2] = __byte_perm_S (w0[1], w0[2], selector); - w3[1] = __byte_perm_S (w0[0], w0[1], selector); - w3[0] = __byte_perm_S ( 0, w0[0], selector); - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + w[63] = __byte_perm_S (w[50], w[51], selector); + w[62] = __byte_perm_S (w[49], w[50], selector); + w[61] = __byte_perm_S (w[48], w[49], selector); + w[60] = __byte_perm_S (w[47], w[48], selector); + w[59] = __byte_perm_S (w[46], w[47], selector); + w[58] = __byte_perm_S (w[45], w[46], selector); + w[57] = __byte_perm_S (w[44], w[45], selector); + w[56] = __byte_perm_S (w[43], w[44], selector); + w[55] = __byte_perm_S (w[42], w[43], selector); + w[54] = __byte_perm_S (w[41], w[42], selector); + w[53] = __byte_perm_S (w[40], w[41], selector); + w[52] = __byte_perm_S (w[39], w[40], selector); + w[51] = __byte_perm_S (w[38], w[39], selector); + w[50] = __byte_perm_S (w[37], w[38], selector); + w[49] = __byte_perm_S (w[36], w[37], selector); + w[48] = __byte_perm_S (w[35], w[36], selector); + w[47] = __byte_perm_S (w[34], w[35], selector); + w[46] = __byte_perm_S (w[33], w[34], selector); + w[45] = __byte_perm_S (w[32], w[33], selector); + w[44] = __byte_perm_S (w[31], w[32], selector); + w[43] = __byte_perm_S (w[30], w[31], selector); + w[42] = __byte_perm_S (w[29], w[30], selector); + w[41] = __byte_perm_S (w[28], w[29], selector); + w[40] = __byte_perm_S (w[27], w[28], selector); + w[39] = __byte_perm_S (w[26], w[27], selector); + w[38] = __byte_perm_S (w[25], w[26], selector); + w[37] = __byte_perm_S (w[24], w[25], selector); + w[36] = __byte_perm_S (w[23], w[24], selector); + w[35] = __byte_perm_S (w[22], w[23], selector); + w[34] = __byte_perm_S (w[21], w[22], selector); + w[33] = __byte_perm_S (w[20], w[21], selector); + w[32] = __byte_perm_S (w[19], w[20], selector); + w[31] = __byte_perm_S (w[18], w[19], selector); + w[30] = __byte_perm_S (w[17], w[18], selector); + w[29] = __byte_perm_S (w[16], w[17], selector); + w[28] = __byte_perm_S (w[15], w[16], selector); + w[27] = __byte_perm_S (w[14], w[15], selector); + w[26] = __byte_perm_S (w[13], w[14], selector); + w[25] = __byte_perm_S (w[12], w[13], selector); + w[24] = __byte_perm_S (w[11], w[12], selector); + w[23] = __byte_perm_S (w[10], w[11], selector); + w[22] = __byte_perm_S (w[ 9], w[10], selector); + w[21] = __byte_perm_S (w[ 8], w[ 9], selector); + w[20] = __byte_perm_S (w[ 7], w[ 8], selector); + w[19] = __byte_perm_S (w[ 6], w[ 7], selector); + w[18] = __byte_perm_S (w[ 5], w[ 6], selector); + w[17] = __byte_perm_S (w[ 4], w[ 5], selector); + w[16] = __byte_perm_S (w[ 3], w[ 4], selector); + w[15] = __byte_perm_S (w[ 2], w[ 3], selector); + w[14] = __byte_perm_S (w[ 1], w[ 2], selector); + w[13] = __byte_perm_S (w[ 0], w[ 1], selector); + w[12] = __byte_perm_S ( 0, w[ 0], selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 13: - w3[3] = __byte_perm_S (w0[1], w0[2], selector); - w3[2] = __byte_perm_S (w0[0], w0[1], selector); - w3[1] = __byte_perm_S ( 0, w0[0], selector); - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + w[63] = __byte_perm_S (w[49], w[50], selector); + w[62] = __byte_perm_S (w[48], w[49], selector); + w[61] = __byte_perm_S (w[47], w[48], selector); + w[60] = __byte_perm_S (w[46], w[47], selector); + w[59] = __byte_perm_S (w[45], w[46], selector); + w[58] = __byte_perm_S (w[44], w[45], selector); + w[57] = __byte_perm_S (w[43], w[44], selector); + w[56] = __byte_perm_S (w[42], w[43], selector); + w[55] = __byte_perm_S (w[41], w[42], selector); + w[54] = __byte_perm_S (w[40], w[41], selector); + w[53] = __byte_perm_S (w[39], w[40], selector); + w[52] = __byte_perm_S (w[38], w[39], selector); + w[51] = __byte_perm_S (w[37], w[38], selector); + w[50] = __byte_perm_S (w[36], w[37], selector); + w[49] = __byte_perm_S (w[35], w[36], selector); + w[48] = __byte_perm_S (w[34], w[35], selector); + w[47] = __byte_perm_S (w[33], w[34], selector); + w[46] = __byte_perm_S (w[32], w[33], selector); + w[45] = __byte_perm_S (w[31], w[32], selector); + w[44] = __byte_perm_S (w[30], w[31], selector); + w[43] = __byte_perm_S (w[29], w[30], selector); + w[42] = __byte_perm_S (w[28], w[29], selector); + w[41] = __byte_perm_S (w[27], w[28], selector); + w[40] = __byte_perm_S (w[26], w[27], selector); + w[39] = __byte_perm_S (w[25], w[26], selector); + w[38] = __byte_perm_S (w[24], w[25], selector); + w[37] = __byte_perm_S (w[23], w[24], selector); + w[36] = __byte_perm_S (w[22], w[23], selector); + w[35] = __byte_perm_S (w[21], w[22], selector); + w[34] = __byte_perm_S (w[20], w[21], selector); + w[33] = __byte_perm_S (w[19], w[20], selector); + w[32] = __byte_perm_S (w[18], w[19], selector); + w[31] = __byte_perm_S (w[17], w[18], selector); + w[30] = __byte_perm_S (w[16], w[17], selector); + w[29] = __byte_perm_S (w[15], w[16], selector); + w[28] = __byte_perm_S (w[14], w[15], selector); + w[27] = __byte_perm_S (w[13], w[14], selector); + w[26] = __byte_perm_S (w[12], w[13], selector); + w[25] = __byte_perm_S (w[11], w[12], selector); + w[24] = __byte_perm_S (w[10], w[11], selector); + w[23] = __byte_perm_S (w[ 9], w[10], selector); + w[22] = __byte_perm_S (w[ 8], w[ 9], selector); + w[21] = __byte_perm_S (w[ 7], w[ 8], selector); + w[20] = __byte_perm_S (w[ 6], w[ 7], selector); + w[19] = __byte_perm_S (w[ 5], w[ 6], selector); + w[18] = __byte_perm_S (w[ 4], w[ 5], selector); + w[17] = __byte_perm_S (w[ 3], w[ 4], selector); + w[16] = __byte_perm_S (w[ 2], w[ 3], selector); + w[15] = __byte_perm_S (w[ 1], w[ 2], selector); + w[14] = __byte_perm_S (w[ 0], w[ 1], selector); + w[13] = __byte_perm_S ( 0, w[ 0], selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 14: - w3[3] = __byte_perm_S (w0[0], w0[1], selector); - w3[2] = __byte_perm_S ( 0, w0[0], selector); - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + w[63] = __byte_perm_S (w[48], w[49], selector); + w[62] = __byte_perm_S (w[47], w[48], selector); + w[61] = __byte_perm_S (w[46], w[47], selector); + w[60] = __byte_perm_S (w[45], w[46], selector); + w[59] = __byte_perm_S (w[44], w[45], selector); + w[58] = __byte_perm_S (w[43], w[44], selector); + w[57] = __byte_perm_S (w[42], w[43], selector); + w[56] = __byte_perm_S (w[41], w[42], selector); + w[55] = __byte_perm_S (w[40], w[41], selector); + w[54] = __byte_perm_S (w[39], w[40], selector); + w[53] = __byte_perm_S (w[38], w[39], selector); + w[52] = __byte_perm_S (w[37], w[38], selector); + w[51] = __byte_perm_S (w[36], w[37], selector); + w[50] = __byte_perm_S (w[35], w[36], selector); + w[49] = __byte_perm_S (w[34], w[35], selector); + w[48] = __byte_perm_S (w[33], w[34], selector); + w[47] = __byte_perm_S (w[32], w[33], selector); + w[46] = __byte_perm_S (w[31], w[32], selector); + w[45] = __byte_perm_S (w[30], w[31], selector); + w[44] = __byte_perm_S (w[29], w[30], selector); + w[43] = __byte_perm_S (w[28], w[29], selector); + w[42] = __byte_perm_S (w[27], w[28], selector); + w[41] = __byte_perm_S (w[26], w[27], selector); + w[40] = __byte_perm_S (w[25], w[26], selector); + w[39] = __byte_perm_S (w[24], w[25], selector); + w[38] = __byte_perm_S (w[23], w[24], selector); + w[37] = __byte_perm_S (w[22], w[23], selector); + w[36] = __byte_perm_S (w[21], w[22], selector); + w[35] = __byte_perm_S (w[20], w[21], selector); + w[34] = __byte_perm_S (w[19], w[20], selector); + w[33] = __byte_perm_S (w[18], w[19], selector); + w[32] = __byte_perm_S (w[17], w[18], selector); + w[31] = __byte_perm_S (w[16], w[17], selector); + w[30] = __byte_perm_S (w[15], w[16], selector); + w[29] = __byte_perm_S (w[14], w[15], selector); + w[28] = __byte_perm_S (w[13], w[14], selector); + w[27] = __byte_perm_S (w[12], w[13], selector); + w[26] = __byte_perm_S (w[11], w[12], selector); + w[25] = __byte_perm_S (w[10], w[11], selector); + w[24] = __byte_perm_S (w[ 9], w[10], selector); + w[23] = __byte_perm_S (w[ 8], w[ 9], selector); + w[22] = __byte_perm_S (w[ 7], w[ 8], selector); + w[21] = __byte_perm_S (w[ 6], w[ 7], selector); + w[20] = __byte_perm_S (w[ 5], w[ 6], selector); + w[19] = __byte_perm_S (w[ 4], w[ 5], selector); + w[18] = __byte_perm_S (w[ 3], w[ 4], selector); + w[17] = __byte_perm_S (w[ 2], w[ 3], selector); + w[16] = __byte_perm_S (w[ 1], w[ 2], selector); + w[15] = __byte_perm_S (w[ 0], w[ 1], selector); + w[14] = __byte_perm_S ( 0, w[ 0], selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; - case 15: - w3[3] = __byte_perm_S ( 0, w0[0], selector); - w3[2] = 0; - w3[1] = 0; - w3[0] = 0; - w2[3] = 0; - w2[2] = 0; - w2[1] = 0; - w2[0] = 0; - w1[3] = 0; - w1[2] = 0; - w1[1] = 0; - w1[0] = 0; - w0[3] = 0; - w0[2] = 0; - w0[1] = 0; - w0[0] = 0; + w[63] = __byte_perm_S (w[47], w[48], selector); + w[62] = __byte_perm_S (w[46], w[47], selector); + w[61] = __byte_perm_S (w[45], w[46], selector); + w[60] = __byte_perm_S (w[44], w[45], selector); + w[59] = __byte_perm_S (w[43], w[44], selector); + w[58] = __byte_perm_S (w[42], w[43], selector); + w[57] = __byte_perm_S (w[41], w[42], selector); + w[56] = __byte_perm_S (w[40], w[41], selector); + w[55] = __byte_perm_S (w[39], w[40], selector); + w[54] = __byte_perm_S (w[38], w[39], selector); + w[53] = __byte_perm_S (w[37], w[38], selector); + w[52] = __byte_perm_S (w[36], w[37], selector); + w[51] = __byte_perm_S (w[35], w[36], selector); + w[50] = __byte_perm_S (w[34], w[35], selector); + w[49] = __byte_perm_S (w[33], w[34], selector); + w[48] = __byte_perm_S (w[32], w[33], selector); + w[47] = __byte_perm_S (w[31], w[32], selector); + w[46] = __byte_perm_S (w[30], w[31], selector); + w[45] = __byte_perm_S (w[29], w[30], selector); + w[44] = __byte_perm_S (w[28], w[29], selector); + w[43] = __byte_perm_S (w[27], w[28], selector); + w[42] = __byte_perm_S (w[26], w[27], selector); + w[41] = __byte_perm_S (w[25], w[26], selector); + w[40] = __byte_perm_S (w[24], w[25], selector); + w[39] = __byte_perm_S (w[23], w[24], selector); + w[38] = __byte_perm_S (w[22], w[23], selector); + w[37] = __byte_perm_S (w[21], w[22], selector); + w[36] = __byte_perm_S (w[20], w[21], selector); + w[35] = __byte_perm_S (w[19], w[20], selector); + w[34] = __byte_perm_S (w[18], w[19], selector); + w[33] = __byte_perm_S (w[17], w[18], selector); + w[32] = __byte_perm_S (w[16], w[17], selector); + w[31] = __byte_perm_S (w[15], w[16], selector); + w[30] = __byte_perm_S (w[14], w[15], selector); + w[29] = __byte_perm_S (w[13], w[14], selector); + w[28] = __byte_perm_S (w[12], w[13], selector); + w[27] = __byte_perm_S (w[11], w[12], selector); + w[26] = __byte_perm_S (w[10], w[11], selector); + w[25] = __byte_perm_S (w[ 9], w[10], selector); + w[24] = __byte_perm_S (w[ 8], w[ 9], selector); + w[23] = __byte_perm_S (w[ 7], w[ 8], selector); + w[22] = __byte_perm_S (w[ 6], w[ 7], selector); + w[21] = __byte_perm_S (w[ 5], w[ 6], selector); + w[20] = __byte_perm_S (w[ 4], w[ 5], selector); + w[19] = __byte_perm_S (w[ 3], w[ 4], selector); + w[18] = __byte_perm_S (w[ 2], w[ 3], selector); + w[17] = __byte_perm_S (w[ 1], w[ 2], selector); + w[16] = __byte_perm_S (w[ 0], w[ 1], selector); + w[15] = __byte_perm_S ( 0, w[ 0], selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 16: + w[63] = __byte_perm_S (w[46], w[47], selector); + w[62] = __byte_perm_S (w[45], w[46], selector); + w[61] = __byte_perm_S (w[44], w[45], selector); + w[60] = __byte_perm_S (w[43], w[44], selector); + w[59] = __byte_perm_S (w[42], w[43], selector); + w[58] = __byte_perm_S (w[41], w[42], selector); + w[57] = __byte_perm_S (w[40], w[41], selector); + w[56] = __byte_perm_S (w[39], w[40], selector); + w[55] = __byte_perm_S (w[38], w[39], selector); + w[54] = __byte_perm_S (w[37], w[38], selector); + w[53] = __byte_perm_S (w[36], w[37], selector); + w[52] = __byte_perm_S (w[35], w[36], selector); + w[51] = __byte_perm_S (w[34], w[35], selector); + w[50] = __byte_perm_S (w[33], w[34], selector); + w[49] = __byte_perm_S (w[32], w[33], selector); + w[48] = __byte_perm_S (w[31], w[32], selector); + w[47] = __byte_perm_S (w[30], w[31], selector); + w[46] = __byte_perm_S (w[29], w[30], selector); + w[45] = __byte_perm_S (w[28], w[29], selector); + w[44] = __byte_perm_S (w[27], w[28], selector); + w[43] = __byte_perm_S (w[26], w[27], selector); + w[42] = __byte_perm_S (w[25], w[26], selector); + w[41] = __byte_perm_S (w[24], w[25], selector); + w[40] = __byte_perm_S (w[23], w[24], selector); + w[39] = __byte_perm_S (w[22], w[23], selector); + w[38] = __byte_perm_S (w[21], w[22], selector); + w[37] = __byte_perm_S (w[20], w[21], selector); + w[36] = __byte_perm_S (w[19], w[20], selector); + w[35] = __byte_perm_S (w[18], w[19], selector); + w[34] = __byte_perm_S (w[17], w[18], selector); + w[33] = __byte_perm_S (w[16], w[17], selector); + w[32] = __byte_perm_S (w[15], w[16], selector); + w[31] = __byte_perm_S (w[14], w[15], selector); + w[30] = __byte_perm_S (w[13], w[14], selector); + w[29] = __byte_perm_S (w[12], w[13], selector); + w[28] = __byte_perm_S (w[11], w[12], selector); + w[27] = __byte_perm_S (w[10], w[11], selector); + w[26] = __byte_perm_S (w[ 9], w[10], selector); + w[25] = __byte_perm_S (w[ 8], w[ 9], selector); + w[24] = __byte_perm_S (w[ 7], w[ 8], selector); + w[23] = __byte_perm_S (w[ 6], w[ 7], selector); + w[22] = __byte_perm_S (w[ 5], w[ 6], selector); + w[21] = __byte_perm_S (w[ 4], w[ 5], selector); + w[20] = __byte_perm_S (w[ 3], w[ 4], selector); + w[19] = __byte_perm_S (w[ 2], w[ 3], selector); + w[18] = __byte_perm_S (w[ 1], w[ 2], selector); + w[17] = __byte_perm_S (w[ 0], w[ 1], selector); + w[16] = __byte_perm_S ( 0, w[ 0], selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 17: + w[63] = __byte_perm_S (w[45], w[46], selector); + w[62] = __byte_perm_S (w[44], w[45], selector); + w[61] = __byte_perm_S (w[43], w[44], selector); + w[60] = __byte_perm_S (w[42], w[43], selector); + w[59] = __byte_perm_S (w[41], w[42], selector); + w[58] = __byte_perm_S (w[40], w[41], selector); + w[57] = __byte_perm_S (w[39], w[40], selector); + w[56] = __byte_perm_S (w[38], w[39], selector); + w[55] = __byte_perm_S (w[37], w[38], selector); + w[54] = __byte_perm_S (w[36], w[37], selector); + w[53] = __byte_perm_S (w[35], w[36], selector); + w[52] = __byte_perm_S (w[34], w[35], selector); + w[51] = __byte_perm_S (w[33], w[34], selector); + w[50] = __byte_perm_S (w[32], w[33], selector); + w[49] = __byte_perm_S (w[31], w[32], selector); + w[48] = __byte_perm_S (w[30], w[31], selector); + w[47] = __byte_perm_S (w[29], w[30], selector); + w[46] = __byte_perm_S (w[28], w[29], selector); + w[45] = __byte_perm_S (w[27], w[28], selector); + w[44] = __byte_perm_S (w[26], w[27], selector); + w[43] = __byte_perm_S (w[25], w[26], selector); + w[42] = __byte_perm_S (w[24], w[25], selector); + w[41] = __byte_perm_S (w[23], w[24], selector); + w[40] = __byte_perm_S (w[22], w[23], selector); + w[39] = __byte_perm_S (w[21], w[22], selector); + w[38] = __byte_perm_S (w[20], w[21], selector); + w[37] = __byte_perm_S (w[19], w[20], selector); + w[36] = __byte_perm_S (w[18], w[19], selector); + w[35] = __byte_perm_S (w[17], w[18], selector); + w[34] = __byte_perm_S (w[16], w[17], selector); + w[33] = __byte_perm_S (w[15], w[16], selector); + w[32] = __byte_perm_S (w[14], w[15], selector); + w[31] = __byte_perm_S (w[13], w[14], selector); + w[30] = __byte_perm_S (w[12], w[13], selector); + w[29] = __byte_perm_S (w[11], w[12], selector); + w[28] = __byte_perm_S (w[10], w[11], selector); + w[27] = __byte_perm_S (w[ 9], w[10], selector); + w[26] = __byte_perm_S (w[ 8], w[ 9], selector); + w[25] = __byte_perm_S (w[ 7], w[ 8], selector); + w[24] = __byte_perm_S (w[ 6], w[ 7], selector); + w[23] = __byte_perm_S (w[ 5], w[ 6], selector); + w[22] = __byte_perm_S (w[ 4], w[ 5], selector); + w[21] = __byte_perm_S (w[ 3], w[ 4], selector); + w[20] = __byte_perm_S (w[ 2], w[ 3], selector); + w[19] = __byte_perm_S (w[ 1], w[ 2], selector); + w[18] = __byte_perm_S (w[ 0], w[ 1], selector); + w[17] = __byte_perm_S ( 0, w[ 0], selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 18: + w[63] = __byte_perm_S (w[44], w[45], selector); + w[62] = __byte_perm_S (w[43], w[44], selector); + w[61] = __byte_perm_S (w[42], w[43], selector); + w[60] = __byte_perm_S (w[41], w[42], selector); + w[59] = __byte_perm_S (w[40], w[41], selector); + w[58] = __byte_perm_S (w[39], w[40], selector); + w[57] = __byte_perm_S (w[38], w[39], selector); + w[56] = __byte_perm_S (w[37], w[38], selector); + w[55] = __byte_perm_S (w[36], w[37], selector); + w[54] = __byte_perm_S (w[35], w[36], selector); + w[53] = __byte_perm_S (w[34], w[35], selector); + w[52] = __byte_perm_S (w[33], w[34], selector); + w[51] = __byte_perm_S (w[32], w[33], selector); + w[50] = __byte_perm_S (w[31], w[32], selector); + w[49] = __byte_perm_S (w[30], w[31], selector); + w[48] = __byte_perm_S (w[29], w[30], selector); + w[47] = __byte_perm_S (w[28], w[29], selector); + w[46] = __byte_perm_S (w[27], w[28], selector); + w[45] = __byte_perm_S (w[26], w[27], selector); + w[44] = __byte_perm_S (w[25], w[26], selector); + w[43] = __byte_perm_S (w[24], w[25], selector); + w[42] = __byte_perm_S (w[23], w[24], selector); + w[41] = __byte_perm_S (w[22], w[23], selector); + w[40] = __byte_perm_S (w[21], w[22], selector); + w[39] = __byte_perm_S (w[20], w[21], selector); + w[38] = __byte_perm_S (w[19], w[20], selector); + w[37] = __byte_perm_S (w[18], w[19], selector); + w[36] = __byte_perm_S (w[17], w[18], selector); + w[35] = __byte_perm_S (w[16], w[17], selector); + w[34] = __byte_perm_S (w[15], w[16], selector); + w[33] = __byte_perm_S (w[14], w[15], selector); + w[32] = __byte_perm_S (w[13], w[14], selector); + w[31] = __byte_perm_S (w[12], w[13], selector); + w[30] = __byte_perm_S (w[11], w[12], selector); + w[29] = __byte_perm_S (w[10], w[11], selector); + w[28] = __byte_perm_S (w[ 9], w[10], selector); + w[27] = __byte_perm_S (w[ 8], w[ 9], selector); + w[26] = __byte_perm_S (w[ 7], w[ 8], selector); + w[25] = __byte_perm_S (w[ 6], w[ 7], selector); + w[24] = __byte_perm_S (w[ 5], w[ 6], selector); + w[23] = __byte_perm_S (w[ 4], w[ 5], selector); + w[22] = __byte_perm_S (w[ 3], w[ 4], selector); + w[21] = __byte_perm_S (w[ 2], w[ 3], selector); + w[20] = __byte_perm_S (w[ 1], w[ 2], selector); + w[19] = __byte_perm_S (w[ 0], w[ 1], selector); + w[18] = __byte_perm_S ( 0, w[ 0], selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 19: + w[63] = __byte_perm_S (w[43], w[44], selector); + w[62] = __byte_perm_S (w[42], w[43], selector); + w[61] = __byte_perm_S (w[41], w[42], selector); + w[60] = __byte_perm_S (w[40], w[41], selector); + w[59] = __byte_perm_S (w[39], w[40], selector); + w[58] = __byte_perm_S (w[38], w[39], selector); + w[57] = __byte_perm_S (w[37], w[38], selector); + w[56] = __byte_perm_S (w[36], w[37], selector); + w[55] = __byte_perm_S (w[35], w[36], selector); + w[54] = __byte_perm_S (w[34], w[35], selector); + w[53] = __byte_perm_S (w[33], w[34], selector); + w[52] = __byte_perm_S (w[32], w[33], selector); + w[51] = __byte_perm_S (w[31], w[32], selector); + w[50] = __byte_perm_S (w[30], w[31], selector); + w[49] = __byte_perm_S (w[29], w[30], selector); + w[48] = __byte_perm_S (w[28], w[29], selector); + w[47] = __byte_perm_S (w[27], w[28], selector); + w[46] = __byte_perm_S (w[26], w[27], selector); + w[45] = __byte_perm_S (w[25], w[26], selector); + w[44] = __byte_perm_S (w[24], w[25], selector); + w[43] = __byte_perm_S (w[23], w[24], selector); + w[42] = __byte_perm_S (w[22], w[23], selector); + w[41] = __byte_perm_S (w[21], w[22], selector); + w[40] = __byte_perm_S (w[20], w[21], selector); + w[39] = __byte_perm_S (w[19], w[20], selector); + w[38] = __byte_perm_S (w[18], w[19], selector); + w[37] = __byte_perm_S (w[17], w[18], selector); + w[36] = __byte_perm_S (w[16], w[17], selector); + w[35] = __byte_perm_S (w[15], w[16], selector); + w[34] = __byte_perm_S (w[14], w[15], selector); + w[33] = __byte_perm_S (w[13], w[14], selector); + w[32] = __byte_perm_S (w[12], w[13], selector); + w[31] = __byte_perm_S (w[11], w[12], selector); + w[30] = __byte_perm_S (w[10], w[11], selector); + w[29] = __byte_perm_S (w[ 9], w[10], selector); + w[28] = __byte_perm_S (w[ 8], w[ 9], selector); + w[27] = __byte_perm_S (w[ 7], w[ 8], selector); + w[26] = __byte_perm_S (w[ 6], w[ 7], selector); + w[25] = __byte_perm_S (w[ 5], w[ 6], selector); + w[24] = __byte_perm_S (w[ 4], w[ 5], selector); + w[23] = __byte_perm_S (w[ 3], w[ 4], selector); + w[22] = __byte_perm_S (w[ 2], w[ 3], selector); + w[21] = __byte_perm_S (w[ 1], w[ 2], selector); + w[20] = __byte_perm_S (w[ 0], w[ 1], selector); + w[19] = __byte_perm_S ( 0, w[ 0], selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 20: + w[63] = __byte_perm_S (w[42], w[43], selector); + w[62] = __byte_perm_S (w[41], w[42], selector); + w[61] = __byte_perm_S (w[40], w[41], selector); + w[60] = __byte_perm_S (w[39], w[40], selector); + w[59] = __byte_perm_S (w[38], w[39], selector); + w[58] = __byte_perm_S (w[37], w[38], selector); + w[57] = __byte_perm_S (w[36], w[37], selector); + w[56] = __byte_perm_S (w[35], w[36], selector); + w[55] = __byte_perm_S (w[34], w[35], selector); + w[54] = __byte_perm_S (w[33], w[34], selector); + w[53] = __byte_perm_S (w[32], w[33], selector); + w[52] = __byte_perm_S (w[31], w[32], selector); + w[51] = __byte_perm_S (w[30], w[31], selector); + w[50] = __byte_perm_S (w[29], w[30], selector); + w[49] = __byte_perm_S (w[28], w[29], selector); + w[48] = __byte_perm_S (w[27], w[28], selector); + w[47] = __byte_perm_S (w[26], w[27], selector); + w[46] = __byte_perm_S (w[25], w[26], selector); + w[45] = __byte_perm_S (w[24], w[25], selector); + w[44] = __byte_perm_S (w[23], w[24], selector); + w[43] = __byte_perm_S (w[22], w[23], selector); + w[42] = __byte_perm_S (w[21], w[22], selector); + w[41] = __byte_perm_S (w[20], w[21], selector); + w[40] = __byte_perm_S (w[19], w[20], selector); + w[39] = __byte_perm_S (w[18], w[19], selector); + w[38] = __byte_perm_S (w[17], w[18], selector); + w[37] = __byte_perm_S (w[16], w[17], selector); + w[36] = __byte_perm_S (w[15], w[16], selector); + w[35] = __byte_perm_S (w[14], w[15], selector); + w[34] = __byte_perm_S (w[13], w[14], selector); + w[33] = __byte_perm_S (w[12], w[13], selector); + w[32] = __byte_perm_S (w[11], w[12], selector); + w[31] = __byte_perm_S (w[10], w[11], selector); + w[30] = __byte_perm_S (w[ 9], w[10], selector); + w[29] = __byte_perm_S (w[ 8], w[ 9], selector); + w[28] = __byte_perm_S (w[ 7], w[ 8], selector); + w[27] = __byte_perm_S (w[ 6], w[ 7], selector); + w[26] = __byte_perm_S (w[ 5], w[ 6], selector); + w[25] = __byte_perm_S (w[ 4], w[ 5], selector); + w[24] = __byte_perm_S (w[ 3], w[ 4], selector); + w[23] = __byte_perm_S (w[ 2], w[ 3], selector); + w[22] = __byte_perm_S (w[ 1], w[ 2], selector); + w[21] = __byte_perm_S (w[ 0], w[ 1], selector); + w[20] = __byte_perm_S ( 0, w[ 0], selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 21: + w[63] = __byte_perm_S (w[41], w[42], selector); + w[62] = __byte_perm_S (w[40], w[41], selector); + w[61] = __byte_perm_S (w[39], w[40], selector); + w[60] = __byte_perm_S (w[38], w[39], selector); + w[59] = __byte_perm_S (w[37], w[38], selector); + w[58] = __byte_perm_S (w[36], w[37], selector); + w[57] = __byte_perm_S (w[35], w[36], selector); + w[56] = __byte_perm_S (w[34], w[35], selector); + w[55] = __byte_perm_S (w[33], w[34], selector); + w[54] = __byte_perm_S (w[32], w[33], selector); + w[53] = __byte_perm_S (w[31], w[32], selector); + w[52] = __byte_perm_S (w[30], w[31], selector); + w[51] = __byte_perm_S (w[29], w[30], selector); + w[50] = __byte_perm_S (w[28], w[29], selector); + w[49] = __byte_perm_S (w[27], w[28], selector); + w[48] = __byte_perm_S (w[26], w[27], selector); + w[47] = __byte_perm_S (w[25], w[26], selector); + w[46] = __byte_perm_S (w[24], w[25], selector); + w[45] = __byte_perm_S (w[23], w[24], selector); + w[44] = __byte_perm_S (w[22], w[23], selector); + w[43] = __byte_perm_S (w[21], w[22], selector); + w[42] = __byte_perm_S (w[20], w[21], selector); + w[41] = __byte_perm_S (w[19], w[20], selector); + w[40] = __byte_perm_S (w[18], w[19], selector); + w[39] = __byte_perm_S (w[17], w[18], selector); + w[38] = __byte_perm_S (w[16], w[17], selector); + w[37] = __byte_perm_S (w[15], w[16], selector); + w[36] = __byte_perm_S (w[14], w[15], selector); + w[35] = __byte_perm_S (w[13], w[14], selector); + w[34] = __byte_perm_S (w[12], w[13], selector); + w[33] = __byte_perm_S (w[11], w[12], selector); + w[32] = __byte_perm_S (w[10], w[11], selector); + w[31] = __byte_perm_S (w[ 9], w[10], selector); + w[30] = __byte_perm_S (w[ 8], w[ 9], selector); + w[29] = __byte_perm_S (w[ 7], w[ 8], selector); + w[28] = __byte_perm_S (w[ 6], w[ 7], selector); + w[27] = __byte_perm_S (w[ 5], w[ 6], selector); + w[26] = __byte_perm_S (w[ 4], w[ 5], selector); + w[25] = __byte_perm_S (w[ 3], w[ 4], selector); + w[24] = __byte_perm_S (w[ 2], w[ 3], selector); + w[23] = __byte_perm_S (w[ 1], w[ 2], selector); + w[22] = __byte_perm_S (w[ 0], w[ 1], selector); + w[21] = __byte_perm_S ( 0, w[ 0], selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 22: + w[63] = __byte_perm_S (w[40], w[41], selector); + w[62] = __byte_perm_S (w[39], w[40], selector); + w[61] = __byte_perm_S (w[38], w[39], selector); + w[60] = __byte_perm_S (w[37], w[38], selector); + w[59] = __byte_perm_S (w[36], w[37], selector); + w[58] = __byte_perm_S (w[35], w[36], selector); + w[57] = __byte_perm_S (w[34], w[35], selector); + w[56] = __byte_perm_S (w[33], w[34], selector); + w[55] = __byte_perm_S (w[32], w[33], selector); + w[54] = __byte_perm_S (w[31], w[32], selector); + w[53] = __byte_perm_S (w[30], w[31], selector); + w[52] = __byte_perm_S (w[29], w[30], selector); + w[51] = __byte_perm_S (w[28], w[29], selector); + w[50] = __byte_perm_S (w[27], w[28], selector); + w[49] = __byte_perm_S (w[26], w[27], selector); + w[48] = __byte_perm_S (w[25], w[26], selector); + w[47] = __byte_perm_S (w[24], w[25], selector); + w[46] = __byte_perm_S (w[23], w[24], selector); + w[45] = __byte_perm_S (w[22], w[23], selector); + w[44] = __byte_perm_S (w[21], w[22], selector); + w[43] = __byte_perm_S (w[20], w[21], selector); + w[42] = __byte_perm_S (w[19], w[20], selector); + w[41] = __byte_perm_S (w[18], w[19], selector); + w[40] = __byte_perm_S (w[17], w[18], selector); + w[39] = __byte_perm_S (w[16], w[17], selector); + w[38] = __byte_perm_S (w[15], w[16], selector); + w[37] = __byte_perm_S (w[14], w[15], selector); + w[36] = __byte_perm_S (w[13], w[14], selector); + w[35] = __byte_perm_S (w[12], w[13], selector); + w[34] = __byte_perm_S (w[11], w[12], selector); + w[33] = __byte_perm_S (w[10], w[11], selector); + w[32] = __byte_perm_S (w[ 9], w[10], selector); + w[31] = __byte_perm_S (w[ 8], w[ 9], selector); + w[30] = __byte_perm_S (w[ 7], w[ 8], selector); + w[29] = __byte_perm_S (w[ 6], w[ 7], selector); + w[28] = __byte_perm_S (w[ 5], w[ 6], selector); + w[27] = __byte_perm_S (w[ 4], w[ 5], selector); + w[26] = __byte_perm_S (w[ 3], w[ 4], selector); + w[25] = __byte_perm_S (w[ 2], w[ 3], selector); + w[24] = __byte_perm_S (w[ 1], w[ 2], selector); + w[23] = __byte_perm_S (w[ 0], w[ 1], selector); + w[22] = __byte_perm_S ( 0, w[ 0], selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 23: + w[63] = __byte_perm_S (w[39], w[40], selector); + w[62] = __byte_perm_S (w[38], w[39], selector); + w[61] = __byte_perm_S (w[37], w[38], selector); + w[60] = __byte_perm_S (w[36], w[37], selector); + w[59] = __byte_perm_S (w[35], w[36], selector); + w[58] = __byte_perm_S (w[34], w[35], selector); + w[57] = __byte_perm_S (w[33], w[34], selector); + w[56] = __byte_perm_S (w[32], w[33], selector); + w[55] = __byte_perm_S (w[31], w[32], selector); + w[54] = __byte_perm_S (w[30], w[31], selector); + w[53] = __byte_perm_S (w[29], w[30], selector); + w[52] = __byte_perm_S (w[28], w[29], selector); + w[51] = __byte_perm_S (w[27], w[28], selector); + w[50] = __byte_perm_S (w[26], w[27], selector); + w[49] = __byte_perm_S (w[25], w[26], selector); + w[48] = __byte_perm_S (w[24], w[25], selector); + w[47] = __byte_perm_S (w[23], w[24], selector); + w[46] = __byte_perm_S (w[22], w[23], selector); + w[45] = __byte_perm_S (w[21], w[22], selector); + w[44] = __byte_perm_S (w[20], w[21], selector); + w[43] = __byte_perm_S (w[19], w[20], selector); + w[42] = __byte_perm_S (w[18], w[19], selector); + w[41] = __byte_perm_S (w[17], w[18], selector); + w[40] = __byte_perm_S (w[16], w[17], selector); + w[39] = __byte_perm_S (w[15], w[16], selector); + w[38] = __byte_perm_S (w[14], w[15], selector); + w[37] = __byte_perm_S (w[13], w[14], selector); + w[36] = __byte_perm_S (w[12], w[13], selector); + w[35] = __byte_perm_S (w[11], w[12], selector); + w[34] = __byte_perm_S (w[10], w[11], selector); + w[33] = __byte_perm_S (w[ 9], w[10], selector); + w[32] = __byte_perm_S (w[ 8], w[ 9], selector); + w[31] = __byte_perm_S (w[ 7], w[ 8], selector); + w[30] = __byte_perm_S (w[ 6], w[ 7], selector); + w[29] = __byte_perm_S (w[ 5], w[ 6], selector); + w[28] = __byte_perm_S (w[ 4], w[ 5], selector); + w[27] = __byte_perm_S (w[ 3], w[ 4], selector); + w[26] = __byte_perm_S (w[ 2], w[ 3], selector); + w[25] = __byte_perm_S (w[ 1], w[ 2], selector); + w[24] = __byte_perm_S (w[ 0], w[ 1], selector); + w[23] = __byte_perm_S ( 0, w[ 0], selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 24: + w[63] = __byte_perm_S (w[38], w[39], selector); + w[62] = __byte_perm_S (w[37], w[38], selector); + w[61] = __byte_perm_S (w[36], w[37], selector); + w[60] = __byte_perm_S (w[35], w[36], selector); + w[59] = __byte_perm_S (w[34], w[35], selector); + w[58] = __byte_perm_S (w[33], w[34], selector); + w[57] = __byte_perm_S (w[32], w[33], selector); + w[56] = __byte_perm_S (w[31], w[32], selector); + w[55] = __byte_perm_S (w[30], w[31], selector); + w[54] = __byte_perm_S (w[29], w[30], selector); + w[53] = __byte_perm_S (w[28], w[29], selector); + w[52] = __byte_perm_S (w[27], w[28], selector); + w[51] = __byte_perm_S (w[26], w[27], selector); + w[50] = __byte_perm_S (w[25], w[26], selector); + w[49] = __byte_perm_S (w[24], w[25], selector); + w[48] = __byte_perm_S (w[23], w[24], selector); + w[47] = __byte_perm_S (w[22], w[23], selector); + w[46] = __byte_perm_S (w[21], w[22], selector); + w[45] = __byte_perm_S (w[20], w[21], selector); + w[44] = __byte_perm_S (w[19], w[20], selector); + w[43] = __byte_perm_S (w[18], w[19], selector); + w[42] = __byte_perm_S (w[17], w[18], selector); + w[41] = __byte_perm_S (w[16], w[17], selector); + w[40] = __byte_perm_S (w[15], w[16], selector); + w[39] = __byte_perm_S (w[14], w[15], selector); + w[38] = __byte_perm_S (w[13], w[14], selector); + w[37] = __byte_perm_S (w[12], w[13], selector); + w[36] = __byte_perm_S (w[11], w[12], selector); + w[35] = __byte_perm_S (w[10], w[11], selector); + w[34] = __byte_perm_S (w[ 9], w[10], selector); + w[33] = __byte_perm_S (w[ 8], w[ 9], selector); + w[32] = __byte_perm_S (w[ 7], w[ 8], selector); + w[31] = __byte_perm_S (w[ 6], w[ 7], selector); + w[30] = __byte_perm_S (w[ 5], w[ 6], selector); + w[29] = __byte_perm_S (w[ 4], w[ 5], selector); + w[28] = __byte_perm_S (w[ 3], w[ 4], selector); + w[27] = __byte_perm_S (w[ 2], w[ 3], selector); + w[26] = __byte_perm_S (w[ 1], w[ 2], selector); + w[25] = __byte_perm_S (w[ 0], w[ 1], selector); + w[24] = __byte_perm_S ( 0, w[ 0], selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 25: + w[63] = __byte_perm_S (w[37], w[38], selector); + w[62] = __byte_perm_S (w[36], w[37], selector); + w[61] = __byte_perm_S (w[35], w[36], selector); + w[60] = __byte_perm_S (w[34], w[35], selector); + w[59] = __byte_perm_S (w[33], w[34], selector); + w[58] = __byte_perm_S (w[32], w[33], selector); + w[57] = __byte_perm_S (w[31], w[32], selector); + w[56] = __byte_perm_S (w[30], w[31], selector); + w[55] = __byte_perm_S (w[29], w[30], selector); + w[54] = __byte_perm_S (w[28], w[29], selector); + w[53] = __byte_perm_S (w[27], w[28], selector); + w[52] = __byte_perm_S (w[26], w[27], selector); + w[51] = __byte_perm_S (w[25], w[26], selector); + w[50] = __byte_perm_S (w[24], w[25], selector); + w[49] = __byte_perm_S (w[23], w[24], selector); + w[48] = __byte_perm_S (w[22], w[23], selector); + w[47] = __byte_perm_S (w[21], w[22], selector); + w[46] = __byte_perm_S (w[20], w[21], selector); + w[45] = __byte_perm_S (w[19], w[20], selector); + w[44] = __byte_perm_S (w[18], w[19], selector); + w[43] = __byte_perm_S (w[17], w[18], selector); + w[42] = __byte_perm_S (w[16], w[17], selector); + w[41] = __byte_perm_S (w[15], w[16], selector); + w[40] = __byte_perm_S (w[14], w[15], selector); + w[39] = __byte_perm_S (w[13], w[14], selector); + w[38] = __byte_perm_S (w[12], w[13], selector); + w[37] = __byte_perm_S (w[11], w[12], selector); + w[36] = __byte_perm_S (w[10], w[11], selector); + w[35] = __byte_perm_S (w[ 9], w[10], selector); + w[34] = __byte_perm_S (w[ 8], w[ 9], selector); + w[33] = __byte_perm_S (w[ 7], w[ 8], selector); + w[32] = __byte_perm_S (w[ 6], w[ 7], selector); + w[31] = __byte_perm_S (w[ 5], w[ 6], selector); + w[30] = __byte_perm_S (w[ 4], w[ 5], selector); + w[29] = __byte_perm_S (w[ 3], w[ 4], selector); + w[28] = __byte_perm_S (w[ 2], w[ 3], selector); + w[27] = __byte_perm_S (w[ 1], w[ 2], selector); + w[26] = __byte_perm_S (w[ 0], w[ 1], selector); + w[25] = __byte_perm_S ( 0, w[ 0], selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 26: + w[63] = __byte_perm_S (w[36], w[37], selector); + w[62] = __byte_perm_S (w[35], w[36], selector); + w[61] = __byte_perm_S (w[34], w[35], selector); + w[60] = __byte_perm_S (w[33], w[34], selector); + w[59] = __byte_perm_S (w[32], w[33], selector); + w[58] = __byte_perm_S (w[31], w[32], selector); + w[57] = __byte_perm_S (w[30], w[31], selector); + w[56] = __byte_perm_S (w[29], w[30], selector); + w[55] = __byte_perm_S (w[28], w[29], selector); + w[54] = __byte_perm_S (w[27], w[28], selector); + w[53] = __byte_perm_S (w[26], w[27], selector); + w[52] = __byte_perm_S (w[25], w[26], selector); + w[51] = __byte_perm_S (w[24], w[25], selector); + w[50] = __byte_perm_S (w[23], w[24], selector); + w[49] = __byte_perm_S (w[22], w[23], selector); + w[48] = __byte_perm_S (w[21], w[22], selector); + w[47] = __byte_perm_S (w[20], w[21], selector); + w[46] = __byte_perm_S (w[19], w[20], selector); + w[45] = __byte_perm_S (w[18], w[19], selector); + w[44] = __byte_perm_S (w[17], w[18], selector); + w[43] = __byte_perm_S (w[16], w[17], selector); + w[42] = __byte_perm_S (w[15], w[16], selector); + w[41] = __byte_perm_S (w[14], w[15], selector); + w[40] = __byte_perm_S (w[13], w[14], selector); + w[39] = __byte_perm_S (w[12], w[13], selector); + w[38] = __byte_perm_S (w[11], w[12], selector); + w[37] = __byte_perm_S (w[10], w[11], selector); + w[36] = __byte_perm_S (w[ 9], w[10], selector); + w[35] = __byte_perm_S (w[ 8], w[ 9], selector); + w[34] = __byte_perm_S (w[ 7], w[ 8], selector); + w[33] = __byte_perm_S (w[ 6], w[ 7], selector); + w[32] = __byte_perm_S (w[ 5], w[ 6], selector); + w[31] = __byte_perm_S (w[ 4], w[ 5], selector); + w[30] = __byte_perm_S (w[ 3], w[ 4], selector); + w[29] = __byte_perm_S (w[ 2], w[ 3], selector); + w[28] = __byte_perm_S (w[ 1], w[ 2], selector); + w[27] = __byte_perm_S (w[ 0], w[ 1], selector); + w[26] = __byte_perm_S ( 0, w[ 0], selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 27: + w[63] = __byte_perm_S (w[35], w[36], selector); + w[62] = __byte_perm_S (w[34], w[35], selector); + w[61] = __byte_perm_S (w[33], w[34], selector); + w[60] = __byte_perm_S (w[32], w[33], selector); + w[59] = __byte_perm_S (w[31], w[32], selector); + w[58] = __byte_perm_S (w[30], w[31], selector); + w[57] = __byte_perm_S (w[29], w[30], selector); + w[56] = __byte_perm_S (w[28], w[29], selector); + w[55] = __byte_perm_S (w[27], w[28], selector); + w[54] = __byte_perm_S (w[26], w[27], selector); + w[53] = __byte_perm_S (w[25], w[26], selector); + w[52] = __byte_perm_S (w[24], w[25], selector); + w[51] = __byte_perm_S (w[23], w[24], selector); + w[50] = __byte_perm_S (w[22], w[23], selector); + w[49] = __byte_perm_S (w[21], w[22], selector); + w[48] = __byte_perm_S (w[20], w[21], selector); + w[47] = __byte_perm_S (w[19], w[20], selector); + w[46] = __byte_perm_S (w[18], w[19], selector); + w[45] = __byte_perm_S (w[17], w[18], selector); + w[44] = __byte_perm_S (w[16], w[17], selector); + w[43] = __byte_perm_S (w[15], w[16], selector); + w[42] = __byte_perm_S (w[14], w[15], selector); + w[41] = __byte_perm_S (w[13], w[14], selector); + w[40] = __byte_perm_S (w[12], w[13], selector); + w[39] = __byte_perm_S (w[11], w[12], selector); + w[38] = __byte_perm_S (w[10], w[11], selector); + w[37] = __byte_perm_S (w[ 9], w[10], selector); + w[36] = __byte_perm_S (w[ 8], w[ 9], selector); + w[35] = __byte_perm_S (w[ 7], w[ 8], selector); + w[34] = __byte_perm_S (w[ 6], w[ 7], selector); + w[33] = __byte_perm_S (w[ 5], w[ 6], selector); + w[32] = __byte_perm_S (w[ 4], w[ 5], selector); + w[31] = __byte_perm_S (w[ 3], w[ 4], selector); + w[30] = __byte_perm_S (w[ 2], w[ 3], selector); + w[29] = __byte_perm_S (w[ 1], w[ 2], selector); + w[28] = __byte_perm_S (w[ 0], w[ 1], selector); + w[27] = __byte_perm_S ( 0, w[ 0], selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 28: + w[63] = __byte_perm_S (w[34], w[35], selector); + w[62] = __byte_perm_S (w[33], w[34], selector); + w[61] = __byte_perm_S (w[32], w[33], selector); + w[60] = __byte_perm_S (w[31], w[32], selector); + w[59] = __byte_perm_S (w[30], w[31], selector); + w[58] = __byte_perm_S (w[29], w[30], selector); + w[57] = __byte_perm_S (w[28], w[29], selector); + w[56] = __byte_perm_S (w[27], w[28], selector); + w[55] = __byte_perm_S (w[26], w[27], selector); + w[54] = __byte_perm_S (w[25], w[26], selector); + w[53] = __byte_perm_S (w[24], w[25], selector); + w[52] = __byte_perm_S (w[23], w[24], selector); + w[51] = __byte_perm_S (w[22], w[23], selector); + w[50] = __byte_perm_S (w[21], w[22], selector); + w[49] = __byte_perm_S (w[20], w[21], selector); + w[48] = __byte_perm_S (w[19], w[20], selector); + w[47] = __byte_perm_S (w[18], w[19], selector); + w[46] = __byte_perm_S (w[17], w[18], selector); + w[45] = __byte_perm_S (w[16], w[17], selector); + w[44] = __byte_perm_S (w[15], w[16], selector); + w[43] = __byte_perm_S (w[14], w[15], selector); + w[42] = __byte_perm_S (w[13], w[14], selector); + w[41] = __byte_perm_S (w[12], w[13], selector); + w[40] = __byte_perm_S (w[11], w[12], selector); + w[39] = __byte_perm_S (w[10], w[11], selector); + w[38] = __byte_perm_S (w[ 9], w[10], selector); + w[37] = __byte_perm_S (w[ 8], w[ 9], selector); + w[36] = __byte_perm_S (w[ 7], w[ 8], selector); + w[35] = __byte_perm_S (w[ 6], w[ 7], selector); + w[34] = __byte_perm_S (w[ 5], w[ 6], selector); + w[33] = __byte_perm_S (w[ 4], w[ 5], selector); + w[32] = __byte_perm_S (w[ 3], w[ 4], selector); + w[31] = __byte_perm_S (w[ 2], w[ 3], selector); + w[30] = __byte_perm_S (w[ 1], w[ 2], selector); + w[29] = __byte_perm_S (w[ 0], w[ 1], selector); + w[28] = __byte_perm_S ( 0, w[ 0], selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 29: + w[63] = __byte_perm_S (w[33], w[34], selector); + w[62] = __byte_perm_S (w[32], w[33], selector); + w[61] = __byte_perm_S (w[31], w[32], selector); + w[60] = __byte_perm_S (w[30], w[31], selector); + w[59] = __byte_perm_S (w[29], w[30], selector); + w[58] = __byte_perm_S (w[28], w[29], selector); + w[57] = __byte_perm_S (w[27], w[28], selector); + w[56] = __byte_perm_S (w[26], w[27], selector); + w[55] = __byte_perm_S (w[25], w[26], selector); + w[54] = __byte_perm_S (w[24], w[25], selector); + w[53] = __byte_perm_S (w[23], w[24], selector); + w[52] = __byte_perm_S (w[22], w[23], selector); + w[51] = __byte_perm_S (w[21], w[22], selector); + w[50] = __byte_perm_S (w[20], w[21], selector); + w[49] = __byte_perm_S (w[19], w[20], selector); + w[48] = __byte_perm_S (w[18], w[19], selector); + w[47] = __byte_perm_S (w[17], w[18], selector); + w[46] = __byte_perm_S (w[16], w[17], selector); + w[45] = __byte_perm_S (w[15], w[16], selector); + w[44] = __byte_perm_S (w[14], w[15], selector); + w[43] = __byte_perm_S (w[13], w[14], selector); + w[42] = __byte_perm_S (w[12], w[13], selector); + w[41] = __byte_perm_S (w[11], w[12], selector); + w[40] = __byte_perm_S (w[10], w[11], selector); + w[39] = __byte_perm_S (w[ 9], w[10], selector); + w[38] = __byte_perm_S (w[ 8], w[ 9], selector); + w[37] = __byte_perm_S (w[ 7], w[ 8], selector); + w[36] = __byte_perm_S (w[ 6], w[ 7], selector); + w[35] = __byte_perm_S (w[ 5], w[ 6], selector); + w[34] = __byte_perm_S (w[ 4], w[ 5], selector); + w[33] = __byte_perm_S (w[ 3], w[ 4], selector); + w[32] = __byte_perm_S (w[ 2], w[ 3], selector); + w[31] = __byte_perm_S (w[ 1], w[ 2], selector); + w[30] = __byte_perm_S (w[ 0], w[ 1], selector); + w[29] = __byte_perm_S ( 0, w[ 0], selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 30: + w[63] = __byte_perm_S (w[32], w[33], selector); + w[62] = __byte_perm_S (w[31], w[32], selector); + w[61] = __byte_perm_S (w[30], w[31], selector); + w[60] = __byte_perm_S (w[29], w[30], selector); + w[59] = __byte_perm_S (w[28], w[29], selector); + w[58] = __byte_perm_S (w[27], w[28], selector); + w[57] = __byte_perm_S (w[26], w[27], selector); + w[56] = __byte_perm_S (w[25], w[26], selector); + w[55] = __byte_perm_S (w[24], w[25], selector); + w[54] = __byte_perm_S (w[23], w[24], selector); + w[53] = __byte_perm_S (w[22], w[23], selector); + w[52] = __byte_perm_S (w[21], w[22], selector); + w[51] = __byte_perm_S (w[20], w[21], selector); + w[50] = __byte_perm_S (w[19], w[20], selector); + w[49] = __byte_perm_S (w[18], w[19], selector); + w[48] = __byte_perm_S (w[17], w[18], selector); + w[47] = __byte_perm_S (w[16], w[17], selector); + w[46] = __byte_perm_S (w[15], w[16], selector); + w[45] = __byte_perm_S (w[14], w[15], selector); + w[44] = __byte_perm_S (w[13], w[14], selector); + w[43] = __byte_perm_S (w[12], w[13], selector); + w[42] = __byte_perm_S (w[11], w[12], selector); + w[41] = __byte_perm_S (w[10], w[11], selector); + w[40] = __byte_perm_S (w[ 9], w[10], selector); + w[39] = __byte_perm_S (w[ 8], w[ 9], selector); + w[38] = __byte_perm_S (w[ 7], w[ 8], selector); + w[37] = __byte_perm_S (w[ 6], w[ 7], selector); + w[36] = __byte_perm_S (w[ 5], w[ 6], selector); + w[35] = __byte_perm_S (w[ 4], w[ 5], selector); + w[34] = __byte_perm_S (w[ 3], w[ 4], selector); + w[33] = __byte_perm_S (w[ 2], w[ 3], selector); + w[32] = __byte_perm_S (w[ 1], w[ 2], selector); + w[31] = __byte_perm_S (w[ 0], w[ 1], selector); + w[30] = __byte_perm_S ( 0, w[ 0], selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 31: + w[63] = __byte_perm_S (w[31], w[32], selector); + w[62] = __byte_perm_S (w[30], w[31], selector); + w[61] = __byte_perm_S (w[29], w[30], selector); + w[60] = __byte_perm_S (w[28], w[29], selector); + w[59] = __byte_perm_S (w[27], w[28], selector); + w[58] = __byte_perm_S (w[26], w[27], selector); + w[57] = __byte_perm_S (w[25], w[26], selector); + w[56] = __byte_perm_S (w[24], w[25], selector); + w[55] = __byte_perm_S (w[23], w[24], selector); + w[54] = __byte_perm_S (w[22], w[23], selector); + w[53] = __byte_perm_S (w[21], w[22], selector); + w[52] = __byte_perm_S (w[20], w[21], selector); + w[51] = __byte_perm_S (w[19], w[20], selector); + w[50] = __byte_perm_S (w[18], w[19], selector); + w[49] = __byte_perm_S (w[17], w[18], selector); + w[48] = __byte_perm_S (w[16], w[17], selector); + w[47] = __byte_perm_S (w[15], w[16], selector); + w[46] = __byte_perm_S (w[14], w[15], selector); + w[45] = __byte_perm_S (w[13], w[14], selector); + w[44] = __byte_perm_S (w[12], w[13], selector); + w[43] = __byte_perm_S (w[11], w[12], selector); + w[42] = __byte_perm_S (w[10], w[11], selector); + w[41] = __byte_perm_S (w[ 9], w[10], selector); + w[40] = __byte_perm_S (w[ 8], w[ 9], selector); + w[39] = __byte_perm_S (w[ 7], w[ 8], selector); + w[38] = __byte_perm_S (w[ 6], w[ 7], selector); + w[37] = __byte_perm_S (w[ 5], w[ 6], selector); + w[36] = __byte_perm_S (w[ 4], w[ 5], selector); + w[35] = __byte_perm_S (w[ 3], w[ 4], selector); + w[34] = __byte_perm_S (w[ 2], w[ 3], selector); + w[33] = __byte_perm_S (w[ 1], w[ 2], selector); + w[32] = __byte_perm_S (w[ 0], w[ 1], selector); + w[31] = __byte_perm_S ( 0, w[ 0], selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 32: + w[63] = __byte_perm_S (w[30], w[31], selector); + w[62] = __byte_perm_S (w[29], w[30], selector); + w[61] = __byte_perm_S (w[28], w[29], selector); + w[60] = __byte_perm_S (w[27], w[28], selector); + w[59] = __byte_perm_S (w[26], w[27], selector); + w[58] = __byte_perm_S (w[25], w[26], selector); + w[57] = __byte_perm_S (w[24], w[25], selector); + w[56] = __byte_perm_S (w[23], w[24], selector); + w[55] = __byte_perm_S (w[22], w[23], selector); + w[54] = __byte_perm_S (w[21], w[22], selector); + w[53] = __byte_perm_S (w[20], w[21], selector); + w[52] = __byte_perm_S (w[19], w[20], selector); + w[51] = __byte_perm_S (w[18], w[19], selector); + w[50] = __byte_perm_S (w[17], w[18], selector); + w[49] = __byte_perm_S (w[16], w[17], selector); + w[48] = __byte_perm_S (w[15], w[16], selector); + w[47] = __byte_perm_S (w[14], w[15], selector); + w[46] = __byte_perm_S (w[13], w[14], selector); + w[45] = __byte_perm_S (w[12], w[13], selector); + w[44] = __byte_perm_S (w[11], w[12], selector); + w[43] = __byte_perm_S (w[10], w[11], selector); + w[42] = __byte_perm_S (w[ 9], w[10], selector); + w[41] = __byte_perm_S (w[ 8], w[ 9], selector); + w[40] = __byte_perm_S (w[ 7], w[ 8], selector); + w[39] = __byte_perm_S (w[ 6], w[ 7], selector); + w[38] = __byte_perm_S (w[ 5], w[ 6], selector); + w[37] = __byte_perm_S (w[ 4], w[ 5], selector); + w[36] = __byte_perm_S (w[ 3], w[ 4], selector); + w[35] = __byte_perm_S (w[ 2], w[ 3], selector); + w[34] = __byte_perm_S (w[ 1], w[ 2], selector); + w[33] = __byte_perm_S (w[ 0], w[ 1], selector); + w[32] = __byte_perm_S ( 0, w[ 0], selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 33: + w[63] = __byte_perm_S (w[29], w[30], selector); + w[62] = __byte_perm_S (w[28], w[29], selector); + w[61] = __byte_perm_S (w[27], w[28], selector); + w[60] = __byte_perm_S (w[26], w[27], selector); + w[59] = __byte_perm_S (w[25], w[26], selector); + w[58] = __byte_perm_S (w[24], w[25], selector); + w[57] = __byte_perm_S (w[23], w[24], selector); + w[56] = __byte_perm_S (w[22], w[23], selector); + w[55] = __byte_perm_S (w[21], w[22], selector); + w[54] = __byte_perm_S (w[20], w[21], selector); + w[53] = __byte_perm_S (w[19], w[20], selector); + w[52] = __byte_perm_S (w[18], w[19], selector); + w[51] = __byte_perm_S (w[17], w[18], selector); + w[50] = __byte_perm_S (w[16], w[17], selector); + w[49] = __byte_perm_S (w[15], w[16], selector); + w[48] = __byte_perm_S (w[14], w[15], selector); + w[47] = __byte_perm_S (w[13], w[14], selector); + w[46] = __byte_perm_S (w[12], w[13], selector); + w[45] = __byte_perm_S (w[11], w[12], selector); + w[44] = __byte_perm_S (w[10], w[11], selector); + w[43] = __byte_perm_S (w[ 9], w[10], selector); + w[42] = __byte_perm_S (w[ 8], w[ 9], selector); + w[41] = __byte_perm_S (w[ 7], w[ 8], selector); + w[40] = __byte_perm_S (w[ 6], w[ 7], selector); + w[39] = __byte_perm_S (w[ 5], w[ 6], selector); + w[38] = __byte_perm_S (w[ 4], w[ 5], selector); + w[37] = __byte_perm_S (w[ 3], w[ 4], selector); + w[36] = __byte_perm_S (w[ 2], w[ 3], selector); + w[35] = __byte_perm_S (w[ 1], w[ 2], selector); + w[34] = __byte_perm_S (w[ 0], w[ 1], selector); + w[33] = __byte_perm_S ( 0, w[ 0], selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 34: + w[63] = __byte_perm_S (w[28], w[29], selector); + w[62] = __byte_perm_S (w[27], w[28], selector); + w[61] = __byte_perm_S (w[26], w[27], selector); + w[60] = __byte_perm_S (w[25], w[26], selector); + w[59] = __byte_perm_S (w[24], w[25], selector); + w[58] = __byte_perm_S (w[23], w[24], selector); + w[57] = __byte_perm_S (w[22], w[23], selector); + w[56] = __byte_perm_S (w[21], w[22], selector); + w[55] = __byte_perm_S (w[20], w[21], selector); + w[54] = __byte_perm_S (w[19], w[20], selector); + w[53] = __byte_perm_S (w[18], w[19], selector); + w[52] = __byte_perm_S (w[17], w[18], selector); + w[51] = __byte_perm_S (w[16], w[17], selector); + w[50] = __byte_perm_S (w[15], w[16], selector); + w[49] = __byte_perm_S (w[14], w[15], selector); + w[48] = __byte_perm_S (w[13], w[14], selector); + w[47] = __byte_perm_S (w[12], w[13], selector); + w[46] = __byte_perm_S (w[11], w[12], selector); + w[45] = __byte_perm_S (w[10], w[11], selector); + w[44] = __byte_perm_S (w[ 9], w[10], selector); + w[43] = __byte_perm_S (w[ 8], w[ 9], selector); + w[42] = __byte_perm_S (w[ 7], w[ 8], selector); + w[41] = __byte_perm_S (w[ 6], w[ 7], selector); + w[40] = __byte_perm_S (w[ 5], w[ 6], selector); + w[39] = __byte_perm_S (w[ 4], w[ 5], selector); + w[38] = __byte_perm_S (w[ 3], w[ 4], selector); + w[37] = __byte_perm_S (w[ 2], w[ 3], selector); + w[36] = __byte_perm_S (w[ 1], w[ 2], selector); + w[35] = __byte_perm_S (w[ 0], w[ 1], selector); + w[34] = __byte_perm_S ( 0, w[ 0], selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 35: + w[63] = __byte_perm_S (w[27], w[28], selector); + w[62] = __byte_perm_S (w[26], w[27], selector); + w[61] = __byte_perm_S (w[25], w[26], selector); + w[60] = __byte_perm_S (w[24], w[25], selector); + w[59] = __byte_perm_S (w[23], w[24], selector); + w[58] = __byte_perm_S (w[22], w[23], selector); + w[57] = __byte_perm_S (w[21], w[22], selector); + w[56] = __byte_perm_S (w[20], w[21], selector); + w[55] = __byte_perm_S (w[19], w[20], selector); + w[54] = __byte_perm_S (w[18], w[19], selector); + w[53] = __byte_perm_S (w[17], w[18], selector); + w[52] = __byte_perm_S (w[16], w[17], selector); + w[51] = __byte_perm_S (w[15], w[16], selector); + w[50] = __byte_perm_S (w[14], w[15], selector); + w[49] = __byte_perm_S (w[13], w[14], selector); + w[48] = __byte_perm_S (w[12], w[13], selector); + w[47] = __byte_perm_S (w[11], w[12], selector); + w[46] = __byte_perm_S (w[10], w[11], selector); + w[45] = __byte_perm_S (w[ 9], w[10], selector); + w[44] = __byte_perm_S (w[ 8], w[ 9], selector); + w[43] = __byte_perm_S (w[ 7], w[ 8], selector); + w[42] = __byte_perm_S (w[ 6], w[ 7], selector); + w[41] = __byte_perm_S (w[ 5], w[ 6], selector); + w[40] = __byte_perm_S (w[ 4], w[ 5], selector); + w[39] = __byte_perm_S (w[ 3], w[ 4], selector); + w[38] = __byte_perm_S (w[ 2], w[ 3], selector); + w[37] = __byte_perm_S (w[ 1], w[ 2], selector); + w[36] = __byte_perm_S (w[ 0], w[ 1], selector); + w[35] = __byte_perm_S ( 0, w[ 0], selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 36: + w[63] = __byte_perm_S (w[26], w[27], selector); + w[62] = __byte_perm_S (w[25], w[26], selector); + w[61] = __byte_perm_S (w[24], w[25], selector); + w[60] = __byte_perm_S (w[23], w[24], selector); + w[59] = __byte_perm_S (w[22], w[23], selector); + w[58] = __byte_perm_S (w[21], w[22], selector); + w[57] = __byte_perm_S (w[20], w[21], selector); + w[56] = __byte_perm_S (w[19], w[20], selector); + w[55] = __byte_perm_S (w[18], w[19], selector); + w[54] = __byte_perm_S (w[17], w[18], selector); + w[53] = __byte_perm_S (w[16], w[17], selector); + w[52] = __byte_perm_S (w[15], w[16], selector); + w[51] = __byte_perm_S (w[14], w[15], selector); + w[50] = __byte_perm_S (w[13], w[14], selector); + w[49] = __byte_perm_S (w[12], w[13], selector); + w[48] = __byte_perm_S (w[11], w[12], selector); + w[47] = __byte_perm_S (w[10], w[11], selector); + w[46] = __byte_perm_S (w[ 9], w[10], selector); + w[45] = __byte_perm_S (w[ 8], w[ 9], selector); + w[44] = __byte_perm_S (w[ 7], w[ 8], selector); + w[43] = __byte_perm_S (w[ 6], w[ 7], selector); + w[42] = __byte_perm_S (w[ 5], w[ 6], selector); + w[41] = __byte_perm_S (w[ 4], w[ 5], selector); + w[40] = __byte_perm_S (w[ 3], w[ 4], selector); + w[39] = __byte_perm_S (w[ 2], w[ 3], selector); + w[38] = __byte_perm_S (w[ 1], w[ 2], selector); + w[37] = __byte_perm_S (w[ 0], w[ 1], selector); + w[36] = __byte_perm_S ( 0, w[ 0], selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 37: + w[63] = __byte_perm_S (w[25], w[26], selector); + w[62] = __byte_perm_S (w[24], w[25], selector); + w[61] = __byte_perm_S (w[23], w[24], selector); + w[60] = __byte_perm_S (w[22], w[23], selector); + w[59] = __byte_perm_S (w[21], w[22], selector); + w[58] = __byte_perm_S (w[20], w[21], selector); + w[57] = __byte_perm_S (w[19], w[20], selector); + w[56] = __byte_perm_S (w[18], w[19], selector); + w[55] = __byte_perm_S (w[17], w[18], selector); + w[54] = __byte_perm_S (w[16], w[17], selector); + w[53] = __byte_perm_S (w[15], w[16], selector); + w[52] = __byte_perm_S (w[14], w[15], selector); + w[51] = __byte_perm_S (w[13], w[14], selector); + w[50] = __byte_perm_S (w[12], w[13], selector); + w[49] = __byte_perm_S (w[11], w[12], selector); + w[48] = __byte_perm_S (w[10], w[11], selector); + w[47] = __byte_perm_S (w[ 9], w[10], selector); + w[46] = __byte_perm_S (w[ 8], w[ 9], selector); + w[45] = __byte_perm_S (w[ 7], w[ 8], selector); + w[44] = __byte_perm_S (w[ 6], w[ 7], selector); + w[43] = __byte_perm_S (w[ 5], w[ 6], selector); + w[42] = __byte_perm_S (w[ 4], w[ 5], selector); + w[41] = __byte_perm_S (w[ 3], w[ 4], selector); + w[40] = __byte_perm_S (w[ 2], w[ 3], selector); + w[39] = __byte_perm_S (w[ 1], w[ 2], selector); + w[38] = __byte_perm_S (w[ 0], w[ 1], selector); + w[37] = __byte_perm_S ( 0, w[ 0], selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 38: + w[63] = __byte_perm_S (w[24], w[25], selector); + w[62] = __byte_perm_S (w[23], w[24], selector); + w[61] = __byte_perm_S (w[22], w[23], selector); + w[60] = __byte_perm_S (w[21], w[22], selector); + w[59] = __byte_perm_S (w[20], w[21], selector); + w[58] = __byte_perm_S (w[19], w[20], selector); + w[57] = __byte_perm_S (w[18], w[19], selector); + w[56] = __byte_perm_S (w[17], w[18], selector); + w[55] = __byte_perm_S (w[16], w[17], selector); + w[54] = __byte_perm_S (w[15], w[16], selector); + w[53] = __byte_perm_S (w[14], w[15], selector); + w[52] = __byte_perm_S (w[13], w[14], selector); + w[51] = __byte_perm_S (w[12], w[13], selector); + w[50] = __byte_perm_S (w[11], w[12], selector); + w[49] = __byte_perm_S (w[10], w[11], selector); + w[48] = __byte_perm_S (w[ 9], w[10], selector); + w[47] = __byte_perm_S (w[ 8], w[ 9], selector); + w[46] = __byte_perm_S (w[ 7], w[ 8], selector); + w[45] = __byte_perm_S (w[ 6], w[ 7], selector); + w[44] = __byte_perm_S (w[ 5], w[ 6], selector); + w[43] = __byte_perm_S (w[ 4], w[ 5], selector); + w[42] = __byte_perm_S (w[ 3], w[ 4], selector); + w[41] = __byte_perm_S (w[ 2], w[ 3], selector); + w[40] = __byte_perm_S (w[ 1], w[ 2], selector); + w[39] = __byte_perm_S (w[ 0], w[ 1], selector); + w[38] = __byte_perm_S ( 0, w[ 0], selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 39: + w[63] = __byte_perm_S (w[23], w[24], selector); + w[62] = __byte_perm_S (w[22], w[23], selector); + w[61] = __byte_perm_S (w[21], w[22], selector); + w[60] = __byte_perm_S (w[20], w[21], selector); + w[59] = __byte_perm_S (w[19], w[20], selector); + w[58] = __byte_perm_S (w[18], w[19], selector); + w[57] = __byte_perm_S (w[17], w[18], selector); + w[56] = __byte_perm_S (w[16], w[17], selector); + w[55] = __byte_perm_S (w[15], w[16], selector); + w[54] = __byte_perm_S (w[14], w[15], selector); + w[53] = __byte_perm_S (w[13], w[14], selector); + w[52] = __byte_perm_S (w[12], w[13], selector); + w[51] = __byte_perm_S (w[11], w[12], selector); + w[50] = __byte_perm_S (w[10], w[11], selector); + w[49] = __byte_perm_S (w[ 9], w[10], selector); + w[48] = __byte_perm_S (w[ 8], w[ 9], selector); + w[47] = __byte_perm_S (w[ 7], w[ 8], selector); + w[46] = __byte_perm_S (w[ 6], w[ 7], selector); + w[45] = __byte_perm_S (w[ 5], w[ 6], selector); + w[44] = __byte_perm_S (w[ 4], w[ 5], selector); + w[43] = __byte_perm_S (w[ 3], w[ 4], selector); + w[42] = __byte_perm_S (w[ 2], w[ 3], selector); + w[41] = __byte_perm_S (w[ 1], w[ 2], selector); + w[40] = __byte_perm_S (w[ 0], w[ 1], selector); + w[39] = __byte_perm_S ( 0, w[ 0], selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 40: + w[63] = __byte_perm_S (w[22], w[23], selector); + w[62] = __byte_perm_S (w[21], w[22], selector); + w[61] = __byte_perm_S (w[20], w[21], selector); + w[60] = __byte_perm_S (w[19], w[20], selector); + w[59] = __byte_perm_S (w[18], w[19], selector); + w[58] = __byte_perm_S (w[17], w[18], selector); + w[57] = __byte_perm_S (w[16], w[17], selector); + w[56] = __byte_perm_S (w[15], w[16], selector); + w[55] = __byte_perm_S (w[14], w[15], selector); + w[54] = __byte_perm_S (w[13], w[14], selector); + w[53] = __byte_perm_S (w[12], w[13], selector); + w[52] = __byte_perm_S (w[11], w[12], selector); + w[51] = __byte_perm_S (w[10], w[11], selector); + w[50] = __byte_perm_S (w[ 9], w[10], selector); + w[49] = __byte_perm_S (w[ 8], w[ 9], selector); + w[48] = __byte_perm_S (w[ 7], w[ 8], selector); + w[47] = __byte_perm_S (w[ 6], w[ 7], selector); + w[46] = __byte_perm_S (w[ 5], w[ 6], selector); + w[45] = __byte_perm_S (w[ 4], w[ 5], selector); + w[44] = __byte_perm_S (w[ 3], w[ 4], selector); + w[43] = __byte_perm_S (w[ 2], w[ 3], selector); + w[42] = __byte_perm_S (w[ 1], w[ 2], selector); + w[41] = __byte_perm_S (w[ 0], w[ 1], selector); + w[40] = __byte_perm_S ( 0, w[ 0], selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 41: + w[63] = __byte_perm_S (w[21], w[22], selector); + w[62] = __byte_perm_S (w[20], w[21], selector); + w[61] = __byte_perm_S (w[19], w[20], selector); + w[60] = __byte_perm_S (w[18], w[19], selector); + w[59] = __byte_perm_S (w[17], w[18], selector); + w[58] = __byte_perm_S (w[16], w[17], selector); + w[57] = __byte_perm_S (w[15], w[16], selector); + w[56] = __byte_perm_S (w[14], w[15], selector); + w[55] = __byte_perm_S (w[13], w[14], selector); + w[54] = __byte_perm_S (w[12], w[13], selector); + w[53] = __byte_perm_S (w[11], w[12], selector); + w[52] = __byte_perm_S (w[10], w[11], selector); + w[51] = __byte_perm_S (w[ 9], w[10], selector); + w[50] = __byte_perm_S (w[ 8], w[ 9], selector); + w[49] = __byte_perm_S (w[ 7], w[ 8], selector); + w[48] = __byte_perm_S (w[ 6], w[ 7], selector); + w[47] = __byte_perm_S (w[ 5], w[ 6], selector); + w[46] = __byte_perm_S (w[ 4], w[ 5], selector); + w[45] = __byte_perm_S (w[ 3], w[ 4], selector); + w[44] = __byte_perm_S (w[ 2], w[ 3], selector); + w[43] = __byte_perm_S (w[ 1], w[ 2], selector); + w[42] = __byte_perm_S (w[ 0], w[ 1], selector); + w[41] = __byte_perm_S ( 0, w[ 0], selector); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 42: + w[63] = __byte_perm_S (w[20], w[21], selector); + w[62] = __byte_perm_S (w[19], w[20], selector); + w[61] = __byte_perm_S (w[18], w[19], selector); + w[60] = __byte_perm_S (w[17], w[18], selector); + w[59] = __byte_perm_S (w[16], w[17], selector); + w[58] = __byte_perm_S (w[15], w[16], selector); + w[57] = __byte_perm_S (w[14], w[15], selector); + w[56] = __byte_perm_S (w[13], w[14], selector); + w[55] = __byte_perm_S (w[12], w[13], selector); + w[54] = __byte_perm_S (w[11], w[12], selector); + w[53] = __byte_perm_S (w[10], w[11], selector); + w[52] = __byte_perm_S (w[ 9], w[10], selector); + w[51] = __byte_perm_S (w[ 8], w[ 9], selector); + w[50] = __byte_perm_S (w[ 7], w[ 8], selector); + w[49] = __byte_perm_S (w[ 6], w[ 7], selector); + w[48] = __byte_perm_S (w[ 5], w[ 6], selector); + w[47] = __byte_perm_S (w[ 4], w[ 5], selector); + w[46] = __byte_perm_S (w[ 3], w[ 4], selector); + w[45] = __byte_perm_S (w[ 2], w[ 3], selector); + w[44] = __byte_perm_S (w[ 1], w[ 2], selector); + w[43] = __byte_perm_S (w[ 0], w[ 1], selector); + w[42] = __byte_perm_S ( 0, w[ 0], selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 43: + w[63] = __byte_perm_S (w[19], w[20], selector); + w[62] = __byte_perm_S (w[18], w[19], selector); + w[61] = __byte_perm_S (w[17], w[18], selector); + w[60] = __byte_perm_S (w[16], w[17], selector); + w[59] = __byte_perm_S (w[15], w[16], selector); + w[58] = __byte_perm_S (w[14], w[15], selector); + w[57] = __byte_perm_S (w[13], w[14], selector); + w[56] = __byte_perm_S (w[12], w[13], selector); + w[55] = __byte_perm_S (w[11], w[12], selector); + w[54] = __byte_perm_S (w[10], w[11], selector); + w[53] = __byte_perm_S (w[ 9], w[10], selector); + w[52] = __byte_perm_S (w[ 8], w[ 9], selector); + w[51] = __byte_perm_S (w[ 7], w[ 8], selector); + w[50] = __byte_perm_S (w[ 6], w[ 7], selector); + w[49] = __byte_perm_S (w[ 5], w[ 6], selector); + w[48] = __byte_perm_S (w[ 4], w[ 5], selector); + w[47] = __byte_perm_S (w[ 3], w[ 4], selector); + w[46] = __byte_perm_S (w[ 2], w[ 3], selector); + w[45] = __byte_perm_S (w[ 1], w[ 2], selector); + w[44] = __byte_perm_S (w[ 0], w[ 1], selector); + w[43] = __byte_perm_S ( 0, w[ 0], selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 44: + w[63] = __byte_perm_S (w[18], w[19], selector); + w[62] = __byte_perm_S (w[17], w[18], selector); + w[61] = __byte_perm_S (w[16], w[17], selector); + w[60] = __byte_perm_S (w[15], w[16], selector); + w[59] = __byte_perm_S (w[14], w[15], selector); + w[58] = __byte_perm_S (w[13], w[14], selector); + w[57] = __byte_perm_S (w[12], w[13], selector); + w[56] = __byte_perm_S (w[11], w[12], selector); + w[55] = __byte_perm_S (w[10], w[11], selector); + w[54] = __byte_perm_S (w[ 9], w[10], selector); + w[53] = __byte_perm_S (w[ 8], w[ 9], selector); + w[52] = __byte_perm_S (w[ 7], w[ 8], selector); + w[51] = __byte_perm_S (w[ 6], w[ 7], selector); + w[50] = __byte_perm_S (w[ 5], w[ 6], selector); + w[49] = __byte_perm_S (w[ 4], w[ 5], selector); + w[48] = __byte_perm_S (w[ 3], w[ 4], selector); + w[47] = __byte_perm_S (w[ 2], w[ 3], selector); + w[46] = __byte_perm_S (w[ 1], w[ 2], selector); + w[45] = __byte_perm_S (w[ 0], w[ 1], selector); + w[44] = __byte_perm_S ( 0, w[ 0], selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 45: + w[63] = __byte_perm_S (w[17], w[18], selector); + w[62] = __byte_perm_S (w[16], w[17], selector); + w[61] = __byte_perm_S (w[15], w[16], selector); + w[60] = __byte_perm_S (w[14], w[15], selector); + w[59] = __byte_perm_S (w[13], w[14], selector); + w[58] = __byte_perm_S (w[12], w[13], selector); + w[57] = __byte_perm_S (w[11], w[12], selector); + w[56] = __byte_perm_S (w[10], w[11], selector); + w[55] = __byte_perm_S (w[ 9], w[10], selector); + w[54] = __byte_perm_S (w[ 8], w[ 9], selector); + w[53] = __byte_perm_S (w[ 7], w[ 8], selector); + w[52] = __byte_perm_S (w[ 6], w[ 7], selector); + w[51] = __byte_perm_S (w[ 5], w[ 6], selector); + w[50] = __byte_perm_S (w[ 4], w[ 5], selector); + w[49] = __byte_perm_S (w[ 3], w[ 4], selector); + w[48] = __byte_perm_S (w[ 2], w[ 3], selector); + w[47] = __byte_perm_S (w[ 1], w[ 2], selector); + w[46] = __byte_perm_S (w[ 0], w[ 1], selector); + w[45] = __byte_perm_S ( 0, w[ 0], selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 46: + w[63] = __byte_perm_S (w[16], w[17], selector); + w[62] = __byte_perm_S (w[15], w[16], selector); + w[61] = __byte_perm_S (w[14], w[15], selector); + w[60] = __byte_perm_S (w[13], w[14], selector); + w[59] = __byte_perm_S (w[12], w[13], selector); + w[58] = __byte_perm_S (w[11], w[12], selector); + w[57] = __byte_perm_S (w[10], w[11], selector); + w[56] = __byte_perm_S (w[ 9], w[10], selector); + w[55] = __byte_perm_S (w[ 8], w[ 9], selector); + w[54] = __byte_perm_S (w[ 7], w[ 8], selector); + w[53] = __byte_perm_S (w[ 6], w[ 7], selector); + w[52] = __byte_perm_S (w[ 5], w[ 6], selector); + w[51] = __byte_perm_S (w[ 4], w[ 5], selector); + w[50] = __byte_perm_S (w[ 3], w[ 4], selector); + w[49] = __byte_perm_S (w[ 2], w[ 3], selector); + w[48] = __byte_perm_S (w[ 1], w[ 2], selector); + w[47] = __byte_perm_S (w[ 0], w[ 1], selector); + w[46] = __byte_perm_S ( 0, w[ 0], selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 47: + w[63] = __byte_perm_S (w[15], w[16], selector); + w[62] = __byte_perm_S (w[14], w[15], selector); + w[61] = __byte_perm_S (w[13], w[14], selector); + w[60] = __byte_perm_S (w[12], w[13], selector); + w[59] = __byte_perm_S (w[11], w[12], selector); + w[58] = __byte_perm_S (w[10], w[11], selector); + w[57] = __byte_perm_S (w[ 9], w[10], selector); + w[56] = __byte_perm_S (w[ 8], w[ 9], selector); + w[55] = __byte_perm_S (w[ 7], w[ 8], selector); + w[54] = __byte_perm_S (w[ 6], w[ 7], selector); + w[53] = __byte_perm_S (w[ 5], w[ 6], selector); + w[52] = __byte_perm_S (w[ 4], w[ 5], selector); + w[51] = __byte_perm_S (w[ 3], w[ 4], selector); + w[50] = __byte_perm_S (w[ 2], w[ 3], selector); + w[49] = __byte_perm_S (w[ 1], w[ 2], selector); + w[48] = __byte_perm_S (w[ 0], w[ 1], selector); + w[47] = __byte_perm_S ( 0, w[ 0], selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 48: + w[63] = __byte_perm_S (w[14], w[15], selector); + w[62] = __byte_perm_S (w[13], w[14], selector); + w[61] = __byte_perm_S (w[12], w[13], selector); + w[60] = __byte_perm_S (w[11], w[12], selector); + w[59] = __byte_perm_S (w[10], w[11], selector); + w[58] = __byte_perm_S (w[ 9], w[10], selector); + w[57] = __byte_perm_S (w[ 8], w[ 9], selector); + w[56] = __byte_perm_S (w[ 7], w[ 8], selector); + w[55] = __byte_perm_S (w[ 6], w[ 7], selector); + w[54] = __byte_perm_S (w[ 5], w[ 6], selector); + w[53] = __byte_perm_S (w[ 4], w[ 5], selector); + w[52] = __byte_perm_S (w[ 3], w[ 4], selector); + w[51] = __byte_perm_S (w[ 2], w[ 3], selector); + w[50] = __byte_perm_S (w[ 1], w[ 2], selector); + w[49] = __byte_perm_S (w[ 0], w[ 1], selector); + w[48] = __byte_perm_S ( 0, w[ 0], selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 49: + w[63] = __byte_perm_S (w[13], w[14], selector); + w[62] = __byte_perm_S (w[12], w[13], selector); + w[61] = __byte_perm_S (w[11], w[12], selector); + w[60] = __byte_perm_S (w[10], w[11], selector); + w[59] = __byte_perm_S (w[ 9], w[10], selector); + w[58] = __byte_perm_S (w[ 8], w[ 9], selector); + w[57] = __byte_perm_S (w[ 7], w[ 8], selector); + w[56] = __byte_perm_S (w[ 6], w[ 7], selector); + w[55] = __byte_perm_S (w[ 5], w[ 6], selector); + w[54] = __byte_perm_S (w[ 4], w[ 5], selector); + w[53] = __byte_perm_S (w[ 3], w[ 4], selector); + w[52] = __byte_perm_S (w[ 2], w[ 3], selector); + w[51] = __byte_perm_S (w[ 1], w[ 2], selector); + w[50] = __byte_perm_S (w[ 0], w[ 1], selector); + w[49] = __byte_perm_S ( 0, w[ 0], selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 50: + w[63] = __byte_perm_S (w[12], w[13], selector); + w[62] = __byte_perm_S (w[11], w[12], selector); + w[61] = __byte_perm_S (w[10], w[11], selector); + w[60] = __byte_perm_S (w[ 9], w[10], selector); + w[59] = __byte_perm_S (w[ 8], w[ 9], selector); + w[58] = __byte_perm_S (w[ 7], w[ 8], selector); + w[57] = __byte_perm_S (w[ 6], w[ 7], selector); + w[56] = __byte_perm_S (w[ 5], w[ 6], selector); + w[55] = __byte_perm_S (w[ 4], w[ 5], selector); + w[54] = __byte_perm_S (w[ 3], w[ 4], selector); + w[53] = __byte_perm_S (w[ 2], w[ 3], selector); + w[52] = __byte_perm_S (w[ 1], w[ 2], selector); + w[51] = __byte_perm_S (w[ 0], w[ 1], selector); + w[50] = __byte_perm_S ( 0, w[ 0], selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 51: + w[63] = __byte_perm_S (w[11], w[12], selector); + w[62] = __byte_perm_S (w[10], w[11], selector); + w[61] = __byte_perm_S (w[ 9], w[10], selector); + w[60] = __byte_perm_S (w[ 8], w[ 9], selector); + w[59] = __byte_perm_S (w[ 7], w[ 8], selector); + w[58] = __byte_perm_S (w[ 6], w[ 7], selector); + w[57] = __byte_perm_S (w[ 5], w[ 6], selector); + w[56] = __byte_perm_S (w[ 4], w[ 5], selector); + w[55] = __byte_perm_S (w[ 3], w[ 4], selector); + w[54] = __byte_perm_S (w[ 2], w[ 3], selector); + w[53] = __byte_perm_S (w[ 1], w[ 2], selector); + w[52] = __byte_perm_S (w[ 0], w[ 1], selector); + w[51] = __byte_perm_S ( 0, w[ 0], selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 52: + w[63] = __byte_perm_S (w[10], w[11], selector); + w[62] = __byte_perm_S (w[ 9], w[10], selector); + w[61] = __byte_perm_S (w[ 8], w[ 9], selector); + w[60] = __byte_perm_S (w[ 7], w[ 8], selector); + w[59] = __byte_perm_S (w[ 6], w[ 7], selector); + w[58] = __byte_perm_S (w[ 5], w[ 6], selector); + w[57] = __byte_perm_S (w[ 4], w[ 5], selector); + w[56] = __byte_perm_S (w[ 3], w[ 4], selector); + w[55] = __byte_perm_S (w[ 2], w[ 3], selector); + w[54] = __byte_perm_S (w[ 1], w[ 2], selector); + w[53] = __byte_perm_S (w[ 0], w[ 1], selector); + w[52] = __byte_perm_S ( 0, w[ 0], selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 53: + w[63] = __byte_perm_S (w[ 9], w[10], selector); + w[62] = __byte_perm_S (w[ 8], w[ 9], selector); + w[61] = __byte_perm_S (w[ 7], w[ 8], selector); + w[60] = __byte_perm_S (w[ 6], w[ 7], selector); + w[59] = __byte_perm_S (w[ 5], w[ 6], selector); + w[58] = __byte_perm_S (w[ 4], w[ 5], selector); + w[57] = __byte_perm_S (w[ 3], w[ 4], selector); + w[56] = __byte_perm_S (w[ 2], w[ 3], selector); + w[55] = __byte_perm_S (w[ 1], w[ 2], selector); + w[54] = __byte_perm_S (w[ 0], w[ 1], selector); + w[53] = __byte_perm_S ( 0, w[ 0], selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 54: + w[63] = __byte_perm_S (w[ 8], w[ 9], selector); + w[62] = __byte_perm_S (w[ 7], w[ 8], selector); + w[61] = __byte_perm_S (w[ 6], w[ 7], selector); + w[60] = __byte_perm_S (w[ 5], w[ 6], selector); + w[59] = __byte_perm_S (w[ 4], w[ 5], selector); + w[58] = __byte_perm_S (w[ 3], w[ 4], selector); + w[57] = __byte_perm_S (w[ 2], w[ 3], selector); + w[56] = __byte_perm_S (w[ 1], w[ 2], selector); + w[55] = __byte_perm_S (w[ 0], w[ 1], selector); + w[54] = __byte_perm_S ( 0, w[ 0], selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 55: + w[63] = __byte_perm_S (w[ 7], w[ 8], selector); + w[62] = __byte_perm_S (w[ 6], w[ 7], selector); + w[61] = __byte_perm_S (w[ 5], w[ 6], selector); + w[60] = __byte_perm_S (w[ 4], w[ 5], selector); + w[59] = __byte_perm_S (w[ 3], w[ 4], selector); + w[58] = __byte_perm_S (w[ 2], w[ 3], selector); + w[57] = __byte_perm_S (w[ 1], w[ 2], selector); + w[56] = __byte_perm_S (w[ 0], w[ 1], selector); + w[55] = __byte_perm_S ( 0, w[ 0], selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 56: + w[63] = __byte_perm_S (w[ 6], w[ 7], selector); + w[62] = __byte_perm_S (w[ 5], w[ 6], selector); + w[61] = __byte_perm_S (w[ 4], w[ 5], selector); + w[60] = __byte_perm_S (w[ 3], w[ 4], selector); + w[59] = __byte_perm_S (w[ 2], w[ 3], selector); + w[58] = __byte_perm_S (w[ 1], w[ 2], selector); + w[57] = __byte_perm_S (w[ 0], w[ 1], selector); + w[56] = __byte_perm_S ( 0, w[ 0], selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 57: + w[63] = __byte_perm_S (w[ 5], w[ 6], selector); + w[62] = __byte_perm_S (w[ 4], w[ 5], selector); + w[61] = __byte_perm_S (w[ 3], w[ 4], selector); + w[60] = __byte_perm_S (w[ 2], w[ 3], selector); + w[59] = __byte_perm_S (w[ 1], w[ 2], selector); + w[58] = __byte_perm_S (w[ 0], w[ 1], selector); + w[57] = __byte_perm_S ( 0, w[ 0], selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 58: + w[63] = __byte_perm_S (w[ 4], w[ 5], selector); + w[62] = __byte_perm_S (w[ 3], w[ 4], selector); + w[61] = __byte_perm_S (w[ 2], w[ 3], selector); + w[60] = __byte_perm_S (w[ 1], w[ 2], selector); + w[59] = __byte_perm_S (w[ 0], w[ 1], selector); + w[58] = __byte_perm_S ( 0, w[ 0], selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 59: + w[63] = __byte_perm_S (w[ 3], w[ 4], selector); + w[62] = __byte_perm_S (w[ 2], w[ 3], selector); + w[61] = __byte_perm_S (w[ 1], w[ 2], selector); + w[60] = __byte_perm_S (w[ 0], w[ 1], selector); + w[59] = __byte_perm_S ( 0, w[ 0], selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 60: + w[63] = __byte_perm_S (w[ 2], w[ 3], selector); + w[62] = __byte_perm_S (w[ 1], w[ 2], selector); + w[61] = __byte_perm_S (w[ 0], w[ 1], selector); + w[60] = __byte_perm_S ( 0, w[ 0], selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 61: + w[63] = __byte_perm_S (w[ 1], w[ 2], selector); + w[62] = __byte_perm_S (w[ 0], w[ 1], selector); + w[61] = __byte_perm_S ( 0, w[ 0], selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 62: + w[63] = __byte_perm_S (w[ 0], w[ 1], selector); + w[62] = __byte_perm_S ( 0, w[ 0], selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + break; + case 63: + w[63] = __byte_perm_S ( 0, w[ 0], selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; break; } #endif @@ -858,122 +8484,31 @@ __kernel void amp (__global pw_t *pws, __global pw_t *pws_amp, __global const ke if (gid >= gid_max) return; - const u32 pw_l_len = pws[gid].pw_len; + pw_t pw = pws[gid]; - u32 wordl0[4]; + comb_t comb = combs_buf[0]; - wordl0[0] = pws[gid].i[ 0]; - wordl0[1] = pws[gid].i[ 1]; - wordl0[2] = pws[gid].i[ 2]; - wordl0[3] = pws[gid].i[ 3]; + const u32 pw_len = pw.pw_len; - u32 wordl1[4]; - - wordl1[0] = pws[gid].i[ 4]; - wordl1[1] = pws[gid].i[ 5]; - wordl1[2] = pws[gid].i[ 6]; - wordl1[3] = pws[gid].i[ 7]; - - u32 wordl2[4]; - - wordl2[0] = 0; - wordl2[1] = 0; - wordl2[2] = 0; - wordl2[3] = 0; - - u32 wordl3[4]; - - wordl3[0] = 0; - wordl3[1] = 0; - wordl3[2] = 0; - wordl3[3] = 0; - - const u32 pw_r_len = combs_buf[0].pw_len; - - u32 wordr0[4]; - - wordr0[0] = combs_buf[0].i[0]; - wordr0[1] = combs_buf[0].i[1]; - wordr0[2] = combs_buf[0].i[2]; - wordr0[3] = combs_buf[0].i[3]; - - u32 wordr1[4]; - - wordr1[0] = combs_buf[0].i[4]; - wordr1[1] = combs_buf[0].i[5]; - wordr1[2] = combs_buf[0].i[6]; - wordr1[3] = combs_buf[0].i[7]; - - u32 wordr2[4]; - - wordr2[0] = 0; - wordr2[1] = 0; - wordr2[2] = 0; - wordr2[3] = 0; - - u32 wordr3[4]; - - wordr3[0] = 0; - wordr3[1] = 0; - wordr3[2] = 0; - wordr3[3] = 0; + const u32 comb_len = comb.pw_len; if (combs_mode == COMBINATOR_MODE_BASE_LEFT) { - switch_buffer_by_offset_le_S (wordr0, wordr1, wordr2, wordr3, pw_l_len); + switch_buffer_by_offset_le_S (comb.i, pw_len); } if (combs_mode == COMBINATOR_MODE_BASE_RIGHT) { - switch_buffer_by_offset_le_S (wordl0, wordl1, wordl2, wordl3, pw_r_len); + switch_buffer_by_offset_le_S (pw.i, comb_len); } - u32 w0[4]; + #pragma unroll + for (int i = 0; i < 64; i++) + { + pw.i[i] |= comb.i[i]; + } - w0[0] = wordl0[0] | wordr0[0]; - w0[1] = wordl0[1] | wordr0[1]; - w0[2] = wordl0[2] | wordr0[2]; - w0[3] = wordl0[3] | wordr0[3]; + pw.pw_len = pw_len + comb_len; - u32 w1[4]; - - w1[0] = wordl1[0] | wordr1[0]; - w1[1] = wordl1[1] | wordr1[1]; - w1[2] = wordl1[2] | wordr1[2]; - w1[3] = wordl1[3] | wordr1[3]; - - u32 w2[4]; - - w2[0] = wordl2[0] | wordr2[0]; - w2[1] = wordl2[1] | wordr2[1]; - w2[2] = wordl2[2] | wordr2[2]; - w2[3] = wordl2[3] | wordr2[3]; - - u32 w3[4]; - - w3[0] = wordl3[0] | wordr3[0]; - w3[1] = wordl3[1] | wordr3[1]; - w3[2] = wordl3[2] | wordr3[2]; - w3[3] = wordl3[3] | wordr3[3]; - - const u32 pw_len = pw_l_len + pw_r_len; - - pws_amp[gid].i[ 0] = w0[0]; - pws_amp[gid].i[ 1] = w0[1]; - pws_amp[gid].i[ 2] = w0[2]; - pws_amp[gid].i[ 3] = w0[3]; - pws_amp[gid].i[ 4] = w1[0]; - pws_amp[gid].i[ 5] = w1[1]; - pws_amp[gid].i[ 6] = w1[2]; - pws_amp[gid].i[ 7] = w1[3]; - pws_amp[gid].i[ 8] = w2[0]; - pws_amp[gid].i[ 9] = w2[1]; - pws_amp[gid].i[10] = w2[2]; - pws_amp[gid].i[11] = w2[3]; - pws_amp[gid].i[12] = w3[0]; - pws_amp[gid].i[13] = w3[1]; - pws_amp[gid].i[14] = w3[2]; - pws_amp[gid].i[15] = w3[3]; - - pws_amp[gid].pw_len = pw_len; + pws_amp[gid] = pw; }