From f03156b05e096f4a88012b0844da10c163204ba5 Mon Sep 17 00:00:00 2001 From: jsteube Date: Thu, 13 Jul 2017 18:46:24 +0200 Subject: [PATCH] Add switch_buffer_by_offset_1x64_be_S() and code generators for later use --- OpenCL/inc_common.cl | 8725 +++++++++++++++++ ...N_AMD_switch_buffer_by_offset_1x64_be_S.pl | 28 + ...EN_NV_switch_buffer_by_offset_1x64_be_S.pl | 29 + 3 files changed, 8782 insertions(+) create mode 100644 tools/code_generators/GEN_AMD_switch_buffer_by_offset_1x64_be_S.pl create mode 100644 tools/code_generators/GEN_NV_switch_buffer_by_offset_1x64_be_S.pl diff --git a/OpenCL/inc_common.cl b/OpenCL/inc_common.cl index fb06c7b86..d6e0501be 100644 --- a/OpenCL/inc_common.cl +++ b/OpenCL/inc_common.cl @@ -36605,6 +36605,8731 @@ inline void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) #endif } +inline void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) +{ + #if defined IS_AMD || defined IS_GENERIC + const int offset_mod_4 = offset & 3; + + const int offset_minus_4 = 4 - offset; + + switch (offset / 4) + { + case 0: + w[63] = amd_bytealign_S (w[62], w[63], offset); + w[62] = amd_bytealign_S (w[61], w[62], offset); + w[61] = amd_bytealign_S (w[60], w[61], offset); + w[60] = amd_bytealign_S (w[59], w[60], offset); + w[59] = amd_bytealign_S (w[58], w[59], offset); + w[58] = amd_bytealign_S (w[57], w[58], offset); + w[57] = amd_bytealign_S (w[56], w[57], offset); + w[56] = amd_bytealign_S (w[55], w[56], offset); + w[55] = amd_bytealign_S (w[54], w[55], offset); + w[54] = amd_bytealign_S (w[53], w[54], offset); + w[53] = amd_bytealign_S (w[52], w[53], offset); + w[52] = amd_bytealign_S (w[51], w[52], offset); + w[51] = amd_bytealign_S (w[50], w[51], offset); + w[50] = amd_bytealign_S (w[49], w[50], offset); + w[49] = amd_bytealign_S (w[48], w[49], offset); + w[48] = amd_bytealign_S (w[47], w[48], offset); + w[47] = amd_bytealign_S (w[46], w[47], offset); + w[46] = amd_bytealign_S (w[45], w[46], offset); + w[45] = 
amd_bytealign_S (w[44], w[45], offset); + w[44] = amd_bytealign_S (w[43], w[44], offset); + w[43] = amd_bytealign_S (w[42], w[43], offset); + w[42] = amd_bytealign_S (w[41], w[42], offset); + w[41] = amd_bytealign_S (w[40], w[41], offset); + w[40] = amd_bytealign_S (w[39], w[40], offset); + w[39] = amd_bytealign_S (w[38], w[39], offset); + w[38] = amd_bytealign_S (w[37], w[38], offset); + w[37] = amd_bytealign_S (w[36], w[37], offset); + w[36] = amd_bytealign_S (w[35], w[36], offset); + w[35] = amd_bytealign_S (w[34], w[35], offset); + w[34] = amd_bytealign_S (w[33], w[34], offset); + w[33] = amd_bytealign_S (w[32], w[33], offset); + w[32] = amd_bytealign_S (w[31], w[32], offset); + w[31] = amd_bytealign_S (w[30], w[31], offset); + w[30] = amd_bytealign_S (w[29], w[30], offset); + w[29] = amd_bytealign_S (w[28], w[29], offset); + w[28] = amd_bytealign_S (w[27], w[28], offset); + w[27] = amd_bytealign_S (w[26], w[27], offset); + w[26] = amd_bytealign_S (w[25], w[26], offset); + w[25] = amd_bytealign_S (w[24], w[25], offset); + w[24] = amd_bytealign_S (w[23], w[24], offset); + w[23] = amd_bytealign_S (w[22], w[23], offset); + w[22] = amd_bytealign_S (w[21], w[22], offset); + w[21] = amd_bytealign_S (w[20], w[21], offset); + w[20] = amd_bytealign_S (w[19], w[20], offset); + w[19] = amd_bytealign_S (w[18], w[19], offset); + w[18] = amd_bytealign_S (w[17], w[18], offset); + w[17] = amd_bytealign_S (w[16], w[17], offset); + w[16] = amd_bytealign_S (w[15], w[16], offset); + w[15] = amd_bytealign_S (w[14], w[15], offset); + w[14] = amd_bytealign_S (w[13], w[14], offset); + w[13] = amd_bytealign_S (w[12], w[13], offset); + w[12] = amd_bytealign_S (w[11], w[12], offset); + w[11] = amd_bytealign_S (w[10], w[11], offset); + w[10] = amd_bytealign_S (w[ 9], w[10], offset); + w[ 9] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 5] = 
amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 1] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); + + break; + + case 1: + w[63] = amd_bytealign_S (w[61], w[62], offset); + w[62] = amd_bytealign_S (w[60], w[61], offset); + w[61] = amd_bytealign_S (w[59], w[60], offset); + w[60] = amd_bytealign_S (w[58], w[59], offset); + w[59] = amd_bytealign_S (w[57], w[58], offset); + w[58] = amd_bytealign_S (w[56], w[57], offset); + w[57] = amd_bytealign_S (w[55], w[56], offset); + w[56] = amd_bytealign_S (w[54], w[55], offset); + w[55] = amd_bytealign_S (w[53], w[54], offset); + w[54] = amd_bytealign_S (w[52], w[53], offset); + w[53] = amd_bytealign_S (w[51], w[52], offset); + w[52] = amd_bytealign_S (w[50], w[51], offset); + w[51] = amd_bytealign_S (w[49], w[50], offset); + w[50] = amd_bytealign_S (w[48], w[49], offset); + w[49] = amd_bytealign_S (w[47], w[48], offset); + w[48] = amd_bytealign_S (w[46], w[47], offset); + w[47] = amd_bytealign_S (w[45], w[46], offset); + w[46] = amd_bytealign_S (w[44], w[45], offset); + w[45] = amd_bytealign_S (w[43], w[44], offset); + w[44] = amd_bytealign_S (w[42], w[43], offset); + w[43] = amd_bytealign_S (w[41], w[42], offset); + w[42] = amd_bytealign_S (w[40], w[41], offset); + w[41] = amd_bytealign_S (w[39], w[40], offset); + w[40] = amd_bytealign_S (w[38], w[39], offset); + w[39] = amd_bytealign_S (w[37], w[38], offset); + w[38] = amd_bytealign_S (w[36], w[37], offset); + w[37] = amd_bytealign_S (w[35], w[36], offset); + w[36] = amd_bytealign_S (w[34], w[35], offset); + w[35] = amd_bytealign_S (w[33], w[34], offset); + w[34] = amd_bytealign_S (w[32], w[33], offset); + w[33] = amd_bytealign_S (w[31], w[32], offset); + w[32] = amd_bytealign_S (w[30], w[31], offset); + w[31] = amd_bytealign_S (w[29], w[30], offset); + w[30] = amd_bytealign_S (w[28], 
w[29], offset); + w[29] = amd_bytealign_S (w[27], w[28], offset); + w[28] = amd_bytealign_S (w[26], w[27], offset); + w[27] = amd_bytealign_S (w[25], w[26], offset); + w[26] = amd_bytealign_S (w[24], w[25], offset); + w[25] = amd_bytealign_S (w[23], w[24], offset); + w[24] = amd_bytealign_S (w[22], w[23], offset); + w[23] = amd_bytealign_S (w[21], w[22], offset); + w[22] = amd_bytealign_S (w[20], w[21], offset); + w[21] = amd_bytealign_S (w[19], w[20], offset); + w[20] = amd_bytealign_S (w[18], w[19], offset); + w[19] = amd_bytealign_S (w[17], w[18], offset); + w[18] = amd_bytealign_S (w[16], w[17], offset); + w[17] = amd_bytealign_S (w[15], w[16], offset); + w[16] = amd_bytealign_S (w[14], w[15], offset); + w[15] = amd_bytealign_S (w[13], w[14], offset); + w[14] = amd_bytealign_S (w[12], w[13], offset); + w[13] = amd_bytealign_S (w[11], w[12], offset); + w[12] = amd_bytealign_S (w[10], w[11], offset); + w[11] = amd_bytealign_S (w[ 9], w[10], offset); + w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 5] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 3] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 0] = 0; + + break; + + case 2: + w[63] = amd_bytealign_S (w[60], w[61], offset); + w[62] = amd_bytealign_S (w[59], w[60], offset); + w[61] = amd_bytealign_S (w[58], w[59], offset); + w[60] = amd_bytealign_S (w[57], w[58], offset); + w[59] = amd_bytealign_S (w[56], w[57], offset); + w[58] = amd_bytealign_S (w[55], w[56], offset); + w[57] = amd_bytealign_S (w[54], w[55], offset); + w[56] = amd_bytealign_S (w[53], w[54], offset); + w[55] = amd_bytealign_S (w[52], w[53], offset); + w[54] = amd_bytealign_S (w[51], w[52], offset); 
+ w[53] = amd_bytealign_S (w[50], w[51], offset); + w[52] = amd_bytealign_S (w[49], w[50], offset); + w[51] = amd_bytealign_S (w[48], w[49], offset); + w[50] = amd_bytealign_S (w[47], w[48], offset); + w[49] = amd_bytealign_S (w[46], w[47], offset); + w[48] = amd_bytealign_S (w[45], w[46], offset); + w[47] = amd_bytealign_S (w[44], w[45], offset); + w[46] = amd_bytealign_S (w[43], w[44], offset); + w[45] = amd_bytealign_S (w[42], w[43], offset); + w[44] = amd_bytealign_S (w[41], w[42], offset); + w[43] = amd_bytealign_S (w[40], w[41], offset); + w[42] = amd_bytealign_S (w[39], w[40], offset); + w[41] = amd_bytealign_S (w[38], w[39], offset); + w[40] = amd_bytealign_S (w[37], w[38], offset); + w[39] = amd_bytealign_S (w[36], w[37], offset); + w[38] = amd_bytealign_S (w[35], w[36], offset); + w[37] = amd_bytealign_S (w[34], w[35], offset); + w[36] = amd_bytealign_S (w[33], w[34], offset); + w[35] = amd_bytealign_S (w[32], w[33], offset); + w[34] = amd_bytealign_S (w[31], w[32], offset); + w[33] = amd_bytealign_S (w[30], w[31], offset); + w[32] = amd_bytealign_S (w[29], w[30], offset); + w[31] = amd_bytealign_S (w[28], w[29], offset); + w[30] = amd_bytealign_S (w[27], w[28], offset); + w[29] = amd_bytealign_S (w[26], w[27], offset); + w[28] = amd_bytealign_S (w[25], w[26], offset); + w[27] = amd_bytealign_S (w[24], w[25], offset); + w[26] = amd_bytealign_S (w[23], w[24], offset); + w[25] = amd_bytealign_S (w[22], w[23], offset); + w[24] = amd_bytealign_S (w[21], w[22], offset); + w[23] = amd_bytealign_S (w[20], w[21], offset); + w[22] = amd_bytealign_S (w[19], w[20], offset); + w[21] = amd_bytealign_S (w[18], w[19], offset); + w[20] = amd_bytealign_S (w[17], w[18], offset); + w[19] = amd_bytealign_S (w[16], w[17], offset); + w[18] = amd_bytealign_S (w[15], w[16], offset); + w[17] = amd_bytealign_S (w[14], w[15], offset); + w[16] = amd_bytealign_S (w[13], w[14], offset); + w[15] = amd_bytealign_S (w[12], w[13], offset); + w[14] = amd_bytealign_S (w[11], w[12], offset); 
+ w[13] = amd_bytealign_S (w[10], w[11], offset); + w[12] = amd_bytealign_S (w[ 9], w[10], offset); + w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = amd_bytealign_S (w[59], w[60], offset); + w[62] = amd_bytealign_S (w[58], w[59], offset); + w[61] = amd_bytealign_S (w[57], w[58], offset); + w[60] = amd_bytealign_S (w[56], w[57], offset); + w[59] = amd_bytealign_S (w[55], w[56], offset); + w[58] = amd_bytealign_S (w[54], w[55], offset); + w[57] = amd_bytealign_S (w[53], w[54], offset); + w[56] = amd_bytealign_S (w[52], w[53], offset); + w[55] = amd_bytealign_S (w[51], w[52], offset); + w[54] = amd_bytealign_S (w[50], w[51], offset); + w[53] = amd_bytealign_S (w[49], w[50], offset); + w[52] = amd_bytealign_S (w[48], w[49], offset); + w[51] = amd_bytealign_S (w[47], w[48], offset); + w[50] = amd_bytealign_S (w[46], w[47], offset); + w[49] = amd_bytealign_S (w[45], w[46], offset); + w[48] = amd_bytealign_S (w[44], w[45], offset); + w[47] = amd_bytealign_S (w[43], w[44], offset); + w[46] = amd_bytealign_S (w[42], w[43], offset); + w[45] = amd_bytealign_S (w[41], w[42], offset); + w[44] = amd_bytealign_S (w[40], w[41], offset); + w[43] = amd_bytealign_S (w[39], w[40], offset); + w[42] = amd_bytealign_S (w[38], w[39], offset); + w[41] = amd_bytealign_S (w[37], w[38], offset); + w[40] = amd_bytealign_S (w[36], w[37], offset); + w[39] = amd_bytealign_S (w[35], w[36], offset); + w[38] = amd_bytealign_S (w[34], w[35], offset); + w[37] = amd_bytealign_S (w[33], w[34], offset); + 
w[36] = amd_bytealign_S (w[32], w[33], offset); + w[35] = amd_bytealign_S (w[31], w[32], offset); + w[34] = amd_bytealign_S (w[30], w[31], offset); + w[33] = amd_bytealign_S (w[29], w[30], offset); + w[32] = amd_bytealign_S (w[28], w[29], offset); + w[31] = amd_bytealign_S (w[27], w[28], offset); + w[30] = amd_bytealign_S (w[26], w[27], offset); + w[29] = amd_bytealign_S (w[25], w[26], offset); + w[28] = amd_bytealign_S (w[24], w[25], offset); + w[27] = amd_bytealign_S (w[23], w[24], offset); + w[26] = amd_bytealign_S (w[22], w[23], offset); + w[25] = amd_bytealign_S (w[21], w[22], offset); + w[24] = amd_bytealign_S (w[20], w[21], offset); + w[23] = amd_bytealign_S (w[19], w[20], offset); + w[22] = amd_bytealign_S (w[18], w[19], offset); + w[21] = amd_bytealign_S (w[17], w[18], offset); + w[20] = amd_bytealign_S (w[16], w[17], offset); + w[19] = amd_bytealign_S (w[15], w[16], offset); + w[18] = amd_bytealign_S (w[14], w[15], offset); + w[17] = amd_bytealign_S (w[13], w[14], offset); + w[16] = amd_bytealign_S (w[12], w[13], offset); + w[15] = amd_bytealign_S (w[11], w[12], offset); + w[14] = amd_bytealign_S (w[10], w[11], offset); + w[13] = amd_bytealign_S (w[ 9], w[10], offset); + w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[11] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 7] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = amd_bytealign_S (w[58], w[59], offset); + w[62] = amd_bytealign_S (w[57], w[58], offset); + w[61] = amd_bytealign_S (w[56], w[57], offset); + w[60] = amd_bytealign_S (w[55], w[56], offset); + w[59] = amd_bytealign_S (w[54], w[55], 
offset); + w[58] = amd_bytealign_S (w[53], w[54], offset); + w[57] = amd_bytealign_S (w[52], w[53], offset); + w[56] = amd_bytealign_S (w[51], w[52], offset); + w[55] = amd_bytealign_S (w[50], w[51], offset); + w[54] = amd_bytealign_S (w[49], w[50], offset); + w[53] = amd_bytealign_S (w[48], w[49], offset); + w[52] = amd_bytealign_S (w[47], w[48], offset); + w[51] = amd_bytealign_S (w[46], w[47], offset); + w[50] = amd_bytealign_S (w[45], w[46], offset); + w[49] = amd_bytealign_S (w[44], w[45], offset); + w[48] = amd_bytealign_S (w[43], w[44], offset); + w[47] = amd_bytealign_S (w[42], w[43], offset); + w[46] = amd_bytealign_S (w[41], w[42], offset); + w[45] = amd_bytealign_S (w[40], w[41], offset); + w[44] = amd_bytealign_S (w[39], w[40], offset); + w[43] = amd_bytealign_S (w[38], w[39], offset); + w[42] = amd_bytealign_S (w[37], w[38], offset); + w[41] = amd_bytealign_S (w[36], w[37], offset); + w[40] = amd_bytealign_S (w[35], w[36], offset); + w[39] = amd_bytealign_S (w[34], w[35], offset); + w[38] = amd_bytealign_S (w[33], w[34], offset); + w[37] = amd_bytealign_S (w[32], w[33], offset); + w[36] = amd_bytealign_S (w[31], w[32], offset); + w[35] = amd_bytealign_S (w[30], w[31], offset); + w[34] = amd_bytealign_S (w[29], w[30], offset); + w[33] = amd_bytealign_S (w[28], w[29], offset); + w[32] = amd_bytealign_S (w[27], w[28], offset); + w[31] = amd_bytealign_S (w[26], w[27], offset); + w[30] = amd_bytealign_S (w[25], w[26], offset); + w[29] = amd_bytealign_S (w[24], w[25], offset); + w[28] = amd_bytealign_S (w[23], w[24], offset); + w[27] = amd_bytealign_S (w[22], w[23], offset); + w[26] = amd_bytealign_S (w[21], w[22], offset); + w[25] = amd_bytealign_S (w[20], w[21], offset); + w[24] = amd_bytealign_S (w[19], w[20], offset); + w[23] = amd_bytealign_S (w[18], w[19], offset); + w[22] = amd_bytealign_S (w[17], w[18], offset); + w[21] = amd_bytealign_S (w[16], w[17], offset); + w[20] = amd_bytealign_S (w[15], w[16], offset); + w[19] = amd_bytealign_S (w[14], w[15], 
offset); + w[18] = amd_bytealign_S (w[13], w[14], offset); + w[17] = amd_bytealign_S (w[12], w[13], offset); + w[16] = amd_bytealign_S (w[11], w[12], offset); + w[15] = amd_bytealign_S (w[10], w[11], offset); + w[14] = amd_bytealign_S (w[ 9], w[10], offset); + w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[12] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 4] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = amd_bytealign_S (w[57], w[58], offset); + w[62] = amd_bytealign_S (w[56], w[57], offset); + w[61] = amd_bytealign_S (w[55], w[56], offset); + w[60] = amd_bytealign_S (w[54], w[55], offset); + w[59] = amd_bytealign_S (w[53], w[54], offset); + w[58] = amd_bytealign_S (w[52], w[53], offset); + w[57] = amd_bytealign_S (w[51], w[52], offset); + w[56] = amd_bytealign_S (w[50], w[51], offset); + w[55] = amd_bytealign_S (w[49], w[50], offset); + w[54] = amd_bytealign_S (w[48], w[49], offset); + w[53] = amd_bytealign_S (w[47], w[48], offset); + w[52] = amd_bytealign_S (w[46], w[47], offset); + w[51] = amd_bytealign_S (w[45], w[46], offset); + w[50] = amd_bytealign_S (w[44], w[45], offset); + w[49] = amd_bytealign_S (w[43], w[44], offset); + w[48] = amd_bytealign_S (w[42], w[43], offset); + w[47] = amd_bytealign_S (w[41], w[42], offset); + w[46] = amd_bytealign_S (w[40], w[41], offset); + w[45] = amd_bytealign_S (w[39], w[40], offset); + w[44] = amd_bytealign_S (w[38], w[39], offset); + w[43] = amd_bytealign_S (w[37], w[38], offset); + w[42] = amd_bytealign_S (w[36], w[37], offset); + w[41] = amd_bytealign_S (w[35], w[36], offset); + w[40] = 
amd_bytealign_S (w[34], w[35], offset); + w[39] = amd_bytealign_S (w[33], w[34], offset); + w[38] = amd_bytealign_S (w[32], w[33], offset); + w[37] = amd_bytealign_S (w[31], w[32], offset); + w[36] = amd_bytealign_S (w[30], w[31], offset); + w[35] = amd_bytealign_S (w[29], w[30], offset); + w[34] = amd_bytealign_S (w[28], w[29], offset); + w[33] = amd_bytealign_S (w[27], w[28], offset); + w[32] = amd_bytealign_S (w[26], w[27], offset); + w[31] = amd_bytealign_S (w[25], w[26], offset); + w[30] = amd_bytealign_S (w[24], w[25], offset); + w[29] = amd_bytealign_S (w[23], w[24], offset); + w[28] = amd_bytealign_S (w[22], w[23], offset); + w[27] = amd_bytealign_S (w[21], w[22], offset); + w[26] = amd_bytealign_S (w[20], w[21], offset); + w[25] = amd_bytealign_S (w[19], w[20], offset); + w[24] = amd_bytealign_S (w[18], w[19], offset); + w[23] = amd_bytealign_S (w[17], w[18], offset); + w[22] = amd_bytealign_S (w[16], w[17], offset); + w[21] = amd_bytealign_S (w[15], w[16], offset); + w[20] = amd_bytealign_S (w[14], w[15], offset); + w[19] = amd_bytealign_S (w[13], w[14], offset); + w[18] = amd_bytealign_S (w[12], w[13], offset); + w[17] = amd_bytealign_S (w[11], w[12], offset); + w[16] = amd_bytealign_S (w[10], w[11], offset); + w[15] = amd_bytealign_S (w[ 9], w[10], offset); + w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 5] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = amd_bytealign_S (w[56], w[57], offset); + w[62] = amd_bytealign_S (w[55], w[56], offset); + w[61] = 
amd_bytealign_S (w[54], w[55], offset); + w[60] = amd_bytealign_S (w[53], w[54], offset); + w[59] = amd_bytealign_S (w[52], w[53], offset); + w[58] = amd_bytealign_S (w[51], w[52], offset); + w[57] = amd_bytealign_S (w[50], w[51], offset); + w[56] = amd_bytealign_S (w[49], w[50], offset); + w[55] = amd_bytealign_S (w[48], w[49], offset); + w[54] = amd_bytealign_S (w[47], w[48], offset); + w[53] = amd_bytealign_S (w[46], w[47], offset); + w[52] = amd_bytealign_S (w[45], w[46], offset); + w[51] = amd_bytealign_S (w[44], w[45], offset); + w[50] = amd_bytealign_S (w[43], w[44], offset); + w[49] = amd_bytealign_S (w[42], w[43], offset); + w[48] = amd_bytealign_S (w[41], w[42], offset); + w[47] = amd_bytealign_S (w[40], w[41], offset); + w[46] = amd_bytealign_S (w[39], w[40], offset); + w[45] = amd_bytealign_S (w[38], w[39], offset); + w[44] = amd_bytealign_S (w[37], w[38], offset); + w[43] = amd_bytealign_S (w[36], w[37], offset); + w[42] = amd_bytealign_S (w[35], w[36], offset); + w[41] = amd_bytealign_S (w[34], w[35], offset); + w[40] = amd_bytealign_S (w[33], w[34], offset); + w[39] = amd_bytealign_S (w[32], w[33], offset); + w[38] = amd_bytealign_S (w[31], w[32], offset); + w[37] = amd_bytealign_S (w[30], w[31], offset); + w[36] = amd_bytealign_S (w[29], w[30], offset); + w[35] = amd_bytealign_S (w[28], w[29], offset); + w[34] = amd_bytealign_S (w[27], w[28], offset); + w[33] = amd_bytealign_S (w[26], w[27], offset); + w[32] = amd_bytealign_S (w[25], w[26], offset); + w[31] = amd_bytealign_S (w[24], w[25], offset); + w[30] = amd_bytealign_S (w[23], w[24], offset); + w[29] = amd_bytealign_S (w[22], w[23], offset); + w[28] = amd_bytealign_S (w[21], w[22], offset); + w[27] = amd_bytealign_S (w[20], w[21], offset); + w[26] = amd_bytealign_S (w[19], w[20], offset); + w[25] = amd_bytealign_S (w[18], w[19], offset); + w[24] = amd_bytealign_S (w[17], w[18], offset); + w[23] = amd_bytealign_S (w[16], w[17], offset); + w[22] = amd_bytealign_S (w[15], w[16], offset); + w[21] = 
amd_bytealign_S (w[14], w[15], offset); + w[20] = amd_bytealign_S (w[13], w[14], offset); + w[19] = amd_bytealign_S (w[12], w[13], offset); + w[18] = amd_bytealign_S (w[11], w[12], offset); + w[17] = amd_bytealign_S (w[10], w[11], offset); + w[16] = amd_bytealign_S (w[ 9], w[10], offset); + w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[ 9] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 8] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = amd_bytealign_S (w[55], w[56], offset); + w[62] = amd_bytealign_S (w[54], w[55], offset); + w[61] = amd_bytealign_S (w[53], w[54], offset); + w[60] = amd_bytealign_S (w[52], w[53], offset); + w[59] = amd_bytealign_S (w[51], w[52], offset); + w[58] = amd_bytealign_S (w[50], w[51], offset); + w[57] = amd_bytealign_S (w[49], w[50], offset); + w[56] = amd_bytealign_S (w[48], w[49], offset); + w[55] = amd_bytealign_S (w[47], w[48], offset); + w[54] = amd_bytealign_S (w[46], w[47], offset); + w[53] = amd_bytealign_S (w[45], w[46], offset); + w[52] = amd_bytealign_S (w[44], w[45], offset); + w[51] = amd_bytealign_S (w[43], w[44], offset); + w[50] = amd_bytealign_S (w[42], w[43], offset); + w[49] = amd_bytealign_S (w[41], w[42], offset); + w[48] = amd_bytealign_S (w[40], w[41], offset); + w[47] = amd_bytealign_S (w[39], w[40], offset); + w[46] = amd_bytealign_S (w[38], w[39], offset); + w[45] = amd_bytealign_S (w[37], w[38], offset); + w[44] = amd_bytealign_S (w[36], w[37], offset); + w[43] = amd_bytealign_S (w[35], w[36], offset); + w[42] = amd_bytealign_S (w[34], w[35], offset); + w[41] = 
amd_bytealign_S (w[33], w[34], offset); + w[40] = amd_bytealign_S (w[32], w[33], offset); + w[39] = amd_bytealign_S (w[31], w[32], offset); + w[38] = amd_bytealign_S (w[30], w[31], offset); + w[37] = amd_bytealign_S (w[29], w[30], offset); + w[36] = amd_bytealign_S (w[28], w[29], offset); + w[35] = amd_bytealign_S (w[27], w[28], offset); + w[34] = amd_bytealign_S (w[26], w[27], offset); + w[33] = amd_bytealign_S (w[25], w[26], offset); + w[32] = amd_bytealign_S (w[24], w[25], offset); + w[31] = amd_bytealign_S (w[23], w[24], offset); + w[30] = amd_bytealign_S (w[22], w[23], offset); + w[29] = amd_bytealign_S (w[21], w[22], offset); + w[28] = amd_bytealign_S (w[20], w[21], offset); + w[27] = amd_bytealign_S (w[19], w[20], offset); + w[26] = amd_bytealign_S (w[18], w[19], offset); + w[25] = amd_bytealign_S (w[17], w[18], offset); + w[24] = amd_bytealign_S (w[16], w[17], offset); + w[23] = amd_bytealign_S (w[15], w[16], offset); + w[22] = amd_bytealign_S (w[14], w[15], offset); + w[21] = amd_bytealign_S (w[13], w[14], offset); + w[20] = amd_bytealign_S (w[12], w[13], offset); + w[19] = amd_bytealign_S (w[11], w[12], offset); + w[18] = amd_bytealign_S (w[10], w[11], offset); + w[17] = amd_bytealign_S (w[ 9], w[10], offset); + w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[11] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = amd_bytealign_S (w[54], w[55], offset); + w[62] = amd_bytealign_S (w[53], w[54], offset); + w[61] = amd_bytealign_S (w[52], w[53], 
offset); + w[60] = amd_bytealign_S (w[51], w[52], offset); + w[59] = amd_bytealign_S (w[50], w[51], offset); + w[58] = amd_bytealign_S (w[49], w[50], offset); + w[57] = amd_bytealign_S (w[48], w[49], offset); + w[56] = amd_bytealign_S (w[47], w[48], offset); + w[55] = amd_bytealign_S (w[46], w[47], offset); + w[54] = amd_bytealign_S (w[45], w[46], offset); + w[53] = amd_bytealign_S (w[44], w[45], offset); + w[52] = amd_bytealign_S (w[43], w[44], offset); + w[51] = amd_bytealign_S (w[42], w[43], offset); + w[50] = amd_bytealign_S (w[41], w[42], offset); + w[49] = amd_bytealign_S (w[40], w[41], offset); + w[48] = amd_bytealign_S (w[39], w[40], offset); + w[47] = amd_bytealign_S (w[38], w[39], offset); + w[46] = amd_bytealign_S (w[37], w[38], offset); + w[45] = amd_bytealign_S (w[36], w[37], offset); + w[44] = amd_bytealign_S (w[35], w[36], offset); + w[43] = amd_bytealign_S (w[34], w[35], offset); + w[42] = amd_bytealign_S (w[33], w[34], offset); + w[41] = amd_bytealign_S (w[32], w[33], offset); + w[40] = amd_bytealign_S (w[31], w[32], offset); + w[39] = amd_bytealign_S (w[30], w[31], offset); + w[38] = amd_bytealign_S (w[29], w[30], offset); + w[37] = amd_bytealign_S (w[28], w[29], offset); + w[36] = amd_bytealign_S (w[27], w[28], offset); + w[35] = amd_bytealign_S (w[26], w[27], offset); + w[34] = amd_bytealign_S (w[25], w[26], offset); + w[33] = amd_bytealign_S (w[24], w[25], offset); + w[32] = amd_bytealign_S (w[23], w[24], offset); + w[31] = amd_bytealign_S (w[22], w[23], offset); + w[30] = amd_bytealign_S (w[21], w[22], offset); + w[29] = amd_bytealign_S (w[20], w[21], offset); + w[28] = amd_bytealign_S (w[19], w[20], offset); + w[27] = amd_bytealign_S (w[18], w[19], offset); + w[26] = amd_bytealign_S (w[17], w[18], offset); + w[25] = amd_bytealign_S (w[16], w[17], offset); + w[24] = amd_bytealign_S (w[15], w[16], offset); + w[23] = amd_bytealign_S (w[14], w[15], offset); + w[22] = amd_bytealign_S (w[13], w[14], offset); + w[21] = amd_bytealign_S (w[12], w[13], 
offset); + w[20] = amd_bytealign_S (w[11], w[12], offset); + w[19] = amd_bytealign_S (w[10], w[11], offset); + w[18] = amd_bytealign_S (w[ 9], w[10], offset); + w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = amd_bytealign_S (w[53], w[54], offset); + w[62] = amd_bytealign_S (w[52], w[53], offset); + w[61] = amd_bytealign_S (w[51], w[52], offset); + w[60] = amd_bytealign_S (w[50], w[51], offset); + w[59] = amd_bytealign_S (w[49], w[50], offset); + w[58] = amd_bytealign_S (w[48], w[49], offset); + w[57] = amd_bytealign_S (w[47], w[48], offset); + w[56] = amd_bytealign_S (w[46], w[47], offset); + w[55] = amd_bytealign_S (w[45], w[46], offset); + w[54] = amd_bytealign_S (w[44], w[45], offset); + w[53] = amd_bytealign_S (w[43], w[44], offset); + w[52] = amd_bytealign_S (w[42], w[43], offset); + w[51] = amd_bytealign_S (w[41], w[42], offset); + w[50] = amd_bytealign_S (w[40], w[41], offset); + w[49] = amd_bytealign_S (w[39], w[40], offset); + w[48] = amd_bytealign_S (w[38], w[39], offset); + w[47] = amd_bytealign_S (w[37], w[38], offset); + w[46] = amd_bytealign_S (w[36], w[37], offset); + w[45] = amd_bytealign_S (w[35], w[36], offset); + w[44] = amd_bytealign_S (w[34], w[35], offset); + w[43] = amd_bytealign_S (w[33], w[34], offset); + w[42] = amd_bytealign_S (w[32], w[33], offset); + w[41] = amd_bytealign_S (w[31], w[32], offset); + w[40] = amd_bytealign_S (w[30], w[31], offset); + w[39] = 
amd_bytealign_S (w[29], w[30], offset); + w[38] = amd_bytealign_S (w[28], w[29], offset); + w[37] = amd_bytealign_S (w[27], w[28], offset); + w[36] = amd_bytealign_S (w[26], w[27], offset); + w[35] = amd_bytealign_S (w[25], w[26], offset); + w[34] = amd_bytealign_S (w[24], w[25], offset); + w[33] = amd_bytealign_S (w[23], w[24], offset); + w[32] = amd_bytealign_S (w[22], w[23], offset); + w[31] = amd_bytealign_S (w[21], w[22], offset); + w[30] = amd_bytealign_S (w[20], w[21], offset); + w[29] = amd_bytealign_S (w[19], w[20], offset); + w[28] = amd_bytealign_S (w[18], w[19], offset); + w[27] = amd_bytealign_S (w[17], w[18], offset); + w[26] = amd_bytealign_S (w[16], w[17], offset); + w[25] = amd_bytealign_S (w[15], w[16], offset); + w[24] = amd_bytealign_S (w[14], w[15], offset); + w[23] = amd_bytealign_S (w[13], w[14], offset); + w[22] = amd_bytealign_S (w[12], w[13], offset); + w[21] = amd_bytealign_S (w[11], w[12], offset); + w[20] = amd_bytealign_S (w[10], w[11], offset); + w[19] = amd_bytealign_S (w[ 9], w[10], offset); + w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = amd_bytealign_S (w[52], w[53], offset); + w[62] = amd_bytealign_S (w[51], w[52], offset); + w[61] = amd_bytealign_S (w[50], w[51], offset); + w[60] = amd_bytealign_S (w[49], w[50], offset); + w[59] = amd_bytealign_S (w[48], w[49], offset); + w[58] = amd_bytealign_S (w[47], w[48], offset); + w[57] = 
amd_bytealign_S (w[46], w[47], offset); + w[56] = amd_bytealign_S (w[45], w[46], offset); + w[55] = amd_bytealign_S (w[44], w[45], offset); + w[54] = amd_bytealign_S (w[43], w[44], offset); + w[53] = amd_bytealign_S (w[42], w[43], offset); + w[52] = amd_bytealign_S (w[41], w[42], offset); + w[51] = amd_bytealign_S (w[40], w[41], offset); + w[50] = amd_bytealign_S (w[39], w[40], offset); + w[49] = amd_bytealign_S (w[38], w[39], offset); + w[48] = amd_bytealign_S (w[37], w[38], offset); + w[47] = amd_bytealign_S (w[36], w[37], offset); + w[46] = amd_bytealign_S (w[35], w[36], offset); + w[45] = amd_bytealign_S (w[34], w[35], offset); + w[44] = amd_bytealign_S (w[33], w[34], offset); + w[43] = amd_bytealign_S (w[32], w[33], offset); + w[42] = amd_bytealign_S (w[31], w[32], offset); + w[41] = amd_bytealign_S (w[30], w[31], offset); + w[40] = amd_bytealign_S (w[29], w[30], offset); + w[39] = amd_bytealign_S (w[28], w[29], offset); + w[38] = amd_bytealign_S (w[27], w[28], offset); + w[37] = amd_bytealign_S (w[26], w[27], offset); + w[36] = amd_bytealign_S (w[25], w[26], offset); + w[35] = amd_bytealign_S (w[24], w[25], offset); + w[34] = amd_bytealign_S (w[23], w[24], offset); + w[33] = amd_bytealign_S (w[22], w[23], offset); + w[32] = amd_bytealign_S (w[21], w[22], offset); + w[31] = amd_bytealign_S (w[20], w[21], offset); + w[30] = amd_bytealign_S (w[19], w[20], offset); + w[29] = amd_bytealign_S (w[18], w[19], offset); + w[28] = amd_bytealign_S (w[17], w[18], offset); + w[27] = amd_bytealign_S (w[16], w[17], offset); + w[26] = amd_bytealign_S (w[15], w[16], offset); + w[25] = amd_bytealign_S (w[14], w[15], offset); + w[24] = amd_bytealign_S (w[13], w[14], offset); + w[23] = amd_bytealign_S (w[12], w[13], offset); + w[22] = amd_bytealign_S (w[11], w[12], offset); + w[21] = amd_bytealign_S (w[10], w[11], offset); + w[20] = amd_bytealign_S (w[ 9], w[10], offset); + w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[17] = 
amd_bytealign_S (w[ 6], w[ 7], offset); + w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[10] = amd_bytealign_S ( 0, w[ 0], offset); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = amd_bytealign_S (w[51], w[52], offset); + w[62] = amd_bytealign_S (w[50], w[51], offset); + w[61] = amd_bytealign_S (w[49], w[50], offset); + w[60] = amd_bytealign_S (w[48], w[49], offset); + w[59] = amd_bytealign_S (w[47], w[48], offset); + w[58] = amd_bytealign_S (w[46], w[47], offset); + w[57] = amd_bytealign_S (w[45], w[46], offset); + w[56] = amd_bytealign_S (w[44], w[45], offset); + w[55] = amd_bytealign_S (w[43], w[44], offset); + w[54] = amd_bytealign_S (w[42], w[43], offset); + w[53] = amd_bytealign_S (w[41], w[42], offset); + w[52] = amd_bytealign_S (w[40], w[41], offset); + w[51] = amd_bytealign_S (w[39], w[40], offset); + w[50] = amd_bytealign_S (w[38], w[39], offset); + w[49] = amd_bytealign_S (w[37], w[38], offset); + w[48] = amd_bytealign_S (w[36], w[37], offset); + w[47] = amd_bytealign_S (w[35], w[36], offset); + w[46] = amd_bytealign_S (w[34], w[35], offset); + w[45] = amd_bytealign_S (w[33], w[34], offset); + w[44] = amd_bytealign_S (w[32], w[33], offset); + w[43] = amd_bytealign_S (w[31], w[32], offset); + w[42] = amd_bytealign_S (w[30], w[31], offset); + w[41] = amd_bytealign_S (w[29], w[30], offset); + w[40] = amd_bytealign_S (w[28], w[29], offset); + w[39] = amd_bytealign_S (w[27], w[28], offset); + w[38] = amd_bytealign_S (w[26], w[27], offset); + w[37] = amd_bytealign_S (w[25], w[26], offset); + w[36] = amd_bytealign_S (w[24], w[25], offset); + w[35] = amd_bytealign_S (w[23], w[24], offset); + w[34] 
= amd_bytealign_S (w[22], w[23], offset); + w[33] = amd_bytealign_S (w[21], w[22], offset); + w[32] = amd_bytealign_S (w[20], w[21], offset); + w[31] = amd_bytealign_S (w[19], w[20], offset); + w[30] = amd_bytealign_S (w[18], w[19], offset); + w[29] = amd_bytealign_S (w[17], w[18], offset); + w[28] = amd_bytealign_S (w[16], w[17], offset); + w[27] = amd_bytealign_S (w[15], w[16], offset); + w[26] = amd_bytealign_S (w[14], w[15], offset); + w[25] = amd_bytealign_S (w[13], w[14], offset); + w[24] = amd_bytealign_S (w[12], w[13], offset); + w[23] = amd_bytealign_S (w[11], w[12], offset); + w[22] = amd_bytealign_S (w[10], w[11], offset); + w[21] = amd_bytealign_S (w[ 9], w[10], offset); + w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[18] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[11] = amd_bytealign_S ( 0, w[ 0], offset); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = amd_bytealign_S (w[50], w[51], offset); + w[62] = amd_bytealign_S (w[49], w[50], offset); + w[61] = amd_bytealign_S (w[48], w[49], offset); + w[60] = amd_bytealign_S (w[47], w[48], offset); + w[59] = amd_bytealign_S (w[46], w[47], offset); + w[58] = amd_bytealign_S (w[45], w[46], offset); + w[57] = amd_bytealign_S (w[44], w[45], offset); + w[56] = amd_bytealign_S (w[43], w[44], offset); + w[55] = amd_bytealign_S (w[42], w[43], offset); + w[54] = amd_bytealign_S (w[41], w[42], offset); + w[53] = amd_bytealign_S (w[40], w[41], offset); + w[52] = amd_bytealign_S (w[39], w[40], offset); + w[51] = amd_bytealign_S (w[38], w[39], 
offset); + w[50] = amd_bytealign_S (w[37], w[38], offset); + w[49] = amd_bytealign_S (w[36], w[37], offset); + w[48] = amd_bytealign_S (w[35], w[36], offset); + w[47] = amd_bytealign_S (w[34], w[35], offset); + w[46] = amd_bytealign_S (w[33], w[34], offset); + w[45] = amd_bytealign_S (w[32], w[33], offset); + w[44] = amd_bytealign_S (w[31], w[32], offset); + w[43] = amd_bytealign_S (w[30], w[31], offset); + w[42] = amd_bytealign_S (w[29], w[30], offset); + w[41] = amd_bytealign_S (w[28], w[29], offset); + w[40] = amd_bytealign_S (w[27], w[28], offset); + w[39] = amd_bytealign_S (w[26], w[27], offset); + w[38] = amd_bytealign_S (w[25], w[26], offset); + w[37] = amd_bytealign_S (w[24], w[25], offset); + w[36] = amd_bytealign_S (w[23], w[24], offset); + w[35] = amd_bytealign_S (w[22], w[23], offset); + w[34] = amd_bytealign_S (w[21], w[22], offset); + w[33] = amd_bytealign_S (w[20], w[21], offset); + w[32] = amd_bytealign_S (w[19], w[20], offset); + w[31] = amd_bytealign_S (w[18], w[19], offset); + w[30] = amd_bytealign_S (w[17], w[18], offset); + w[29] = amd_bytealign_S (w[16], w[17], offset); + w[28] = amd_bytealign_S (w[15], w[16], offset); + w[27] = amd_bytealign_S (w[14], w[15], offset); + w[26] = amd_bytealign_S (w[13], w[14], offset); + w[25] = amd_bytealign_S (w[12], w[13], offset); + w[24] = amd_bytealign_S (w[11], w[12], offset); + w[23] = amd_bytealign_S (w[10], w[11], offset); + w[22] = amd_bytealign_S (w[ 9], w[10], offset); + w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[18] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[12] = amd_bytealign_S ( 0, w[ 0], offset); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 
8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 13: + w[63] = amd_bytealign_S (w[49], w[50], offset); + w[62] = amd_bytealign_S (w[48], w[49], offset); + w[61] = amd_bytealign_S (w[47], w[48], offset); + w[60] = amd_bytealign_S (w[46], w[47], offset); + w[59] = amd_bytealign_S (w[45], w[46], offset); + w[58] = amd_bytealign_S (w[44], w[45], offset); + w[57] = amd_bytealign_S (w[43], w[44], offset); + w[56] = amd_bytealign_S (w[42], w[43], offset); + w[55] = amd_bytealign_S (w[41], w[42], offset); + w[54] = amd_bytealign_S (w[40], w[41], offset); + w[53] = amd_bytealign_S (w[39], w[40], offset); + w[52] = amd_bytealign_S (w[38], w[39], offset); + w[51] = amd_bytealign_S (w[37], w[38], offset); + w[50] = amd_bytealign_S (w[36], w[37], offset); + w[49] = amd_bytealign_S (w[35], w[36], offset); + w[48] = amd_bytealign_S (w[34], w[35], offset); + w[47] = amd_bytealign_S (w[33], w[34], offset); + w[46] = amd_bytealign_S (w[32], w[33], offset); + w[45] = amd_bytealign_S (w[31], w[32], offset); + w[44] = amd_bytealign_S (w[30], w[31], offset); + w[43] = amd_bytealign_S (w[29], w[30], offset); + w[42] = amd_bytealign_S (w[28], w[29], offset); + w[41] = amd_bytealign_S (w[27], w[28], offset); + w[40] = amd_bytealign_S (w[26], w[27], offset); + w[39] = amd_bytealign_S (w[25], w[26], offset); + w[38] = amd_bytealign_S (w[24], w[25], offset); + w[37] = amd_bytealign_S (w[23], w[24], offset); + w[36] = amd_bytealign_S (w[22], w[23], offset); + w[35] = amd_bytealign_S (w[21], w[22], offset); + w[34] = amd_bytealign_S (w[20], w[21], offset); + w[33] = amd_bytealign_S (w[19], w[20], offset); + w[32] = amd_bytealign_S (w[18], w[19], offset); + w[31] = amd_bytealign_S (w[17], w[18], offset); + w[30] = amd_bytealign_S (w[16], w[17], offset); + w[29] = amd_bytealign_S (w[15], w[16], offset); + w[28] = amd_bytealign_S (w[14], w[15], offset); + w[27] = amd_bytealign_S (w[13], w[14], offset); + w[26] = 
amd_bytealign_S (w[12], w[13], offset); + w[25] = amd_bytealign_S (w[11], w[12], offset); + w[24] = amd_bytealign_S (w[10], w[11], offset); + w[23] = amd_bytealign_S (w[ 9], w[10], offset); + w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[13] = amd_bytealign_S ( 0, w[ 0], offset); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = amd_bytealign_S (w[48], w[49], offset); + w[62] = amd_bytealign_S (w[47], w[48], offset); + w[61] = amd_bytealign_S (w[46], w[47], offset); + w[60] = amd_bytealign_S (w[45], w[46], offset); + w[59] = amd_bytealign_S (w[44], w[45], offset); + w[58] = amd_bytealign_S (w[43], w[44], offset); + w[57] = amd_bytealign_S (w[42], w[43], offset); + w[56] = amd_bytealign_S (w[41], w[42], offset); + w[55] = amd_bytealign_S (w[40], w[41], offset); + w[54] = amd_bytealign_S (w[39], w[40], offset); + w[53] = amd_bytealign_S (w[38], w[39], offset); + w[52] = amd_bytealign_S (w[37], w[38], offset); + w[51] = amd_bytealign_S (w[36], w[37], offset); + w[50] = amd_bytealign_S (w[35], w[36], offset); + w[49] = amd_bytealign_S (w[34], w[35], offset); + w[48] = amd_bytealign_S (w[33], w[34], offset); + w[47] = amd_bytealign_S (w[32], w[33], offset); + w[46] = amd_bytealign_S (w[31], w[32], offset); + w[45] = amd_bytealign_S (w[30], w[31], offset); + w[44] = amd_bytealign_S (w[29], w[30], offset); + w[43] = amd_bytealign_S (w[28], w[29], offset); + w[42] = amd_bytealign_S (w[27], w[28], offset); + w[41] = 
amd_bytealign_S (w[26], w[27], offset); + w[40] = amd_bytealign_S (w[25], w[26], offset); + w[39] = amd_bytealign_S (w[24], w[25], offset); + w[38] = amd_bytealign_S (w[23], w[24], offset); + w[37] = amd_bytealign_S (w[22], w[23], offset); + w[36] = amd_bytealign_S (w[21], w[22], offset); + w[35] = amd_bytealign_S (w[20], w[21], offset); + w[34] = amd_bytealign_S (w[19], w[20], offset); + w[33] = amd_bytealign_S (w[18], w[19], offset); + w[32] = amd_bytealign_S (w[17], w[18], offset); + w[31] = amd_bytealign_S (w[16], w[17], offset); + w[30] = amd_bytealign_S (w[15], w[16], offset); + w[29] = amd_bytealign_S (w[14], w[15], offset); + w[28] = amd_bytealign_S (w[13], w[14], offset); + w[27] = amd_bytealign_S (w[12], w[13], offset); + w[26] = amd_bytealign_S (w[11], w[12], offset); + w[25] = amd_bytealign_S (w[10], w[11], offset); + w[24] = amd_bytealign_S (w[ 9], w[10], offset); + w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[22] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[17] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[14] = amd_bytealign_S ( 0, w[ 0], offset); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = amd_bytealign_S (w[47], w[48], offset); + w[62] = amd_bytealign_S (w[46], w[47], offset); + w[61] = amd_bytealign_S (w[45], w[46], offset); + w[60] = amd_bytealign_S (w[44], w[45], offset); + w[59] = amd_bytealign_S (w[43], w[44], offset); + w[58] = amd_bytealign_S (w[42], w[43], offset); + w[57] = amd_bytealign_S (w[41], w[42], offset); + w[56] = amd_bytealign_S (w[40], w[41], offset); + 
w[55] = amd_bytealign_S (w[39], w[40], offset); + w[54] = amd_bytealign_S (w[38], w[39], offset); + w[53] = amd_bytealign_S (w[37], w[38], offset); + w[52] = amd_bytealign_S (w[36], w[37], offset); + w[51] = amd_bytealign_S (w[35], w[36], offset); + w[50] = amd_bytealign_S (w[34], w[35], offset); + w[49] = amd_bytealign_S (w[33], w[34], offset); + w[48] = amd_bytealign_S (w[32], w[33], offset); + w[47] = amd_bytealign_S (w[31], w[32], offset); + w[46] = amd_bytealign_S (w[30], w[31], offset); + w[45] = amd_bytealign_S (w[29], w[30], offset); + w[44] = amd_bytealign_S (w[28], w[29], offset); + w[43] = amd_bytealign_S (w[27], w[28], offset); + w[42] = amd_bytealign_S (w[26], w[27], offset); + w[41] = amd_bytealign_S (w[25], w[26], offset); + w[40] = amd_bytealign_S (w[24], w[25], offset); + w[39] = amd_bytealign_S (w[23], w[24], offset); + w[38] = amd_bytealign_S (w[22], w[23], offset); + w[37] = amd_bytealign_S (w[21], w[22], offset); + w[36] = amd_bytealign_S (w[20], w[21], offset); + w[35] = amd_bytealign_S (w[19], w[20], offset); + w[34] = amd_bytealign_S (w[18], w[19], offset); + w[33] = amd_bytealign_S (w[17], w[18], offset); + w[32] = amd_bytealign_S (w[16], w[17], offset); + w[31] = amd_bytealign_S (w[15], w[16], offset); + w[30] = amd_bytealign_S (w[14], w[15], offset); + w[29] = amd_bytealign_S (w[13], w[14], offset); + w[28] = amd_bytealign_S (w[12], w[13], offset); + w[27] = amd_bytealign_S (w[11], w[12], offset); + w[26] = amd_bytealign_S (w[10], w[11], offset); + w[25] = amd_bytealign_S (w[ 9], w[10], offset); + w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[19] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); + 
w[15] = amd_bytealign_S ( 0, w[ 0], offset); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = amd_bytealign_S (w[46], w[47], offset); + w[62] = amd_bytealign_S (w[45], w[46], offset); + w[61] = amd_bytealign_S (w[44], w[45], offset); + w[60] = amd_bytealign_S (w[43], w[44], offset); + w[59] = amd_bytealign_S (w[42], w[43], offset); + w[58] = amd_bytealign_S (w[41], w[42], offset); + w[57] = amd_bytealign_S (w[40], w[41], offset); + w[56] = amd_bytealign_S (w[39], w[40], offset); + w[55] = amd_bytealign_S (w[38], w[39], offset); + w[54] = amd_bytealign_S (w[37], w[38], offset); + w[53] = amd_bytealign_S (w[36], w[37], offset); + w[52] = amd_bytealign_S (w[35], w[36], offset); + w[51] = amd_bytealign_S (w[34], w[35], offset); + w[50] = amd_bytealign_S (w[33], w[34], offset); + w[49] = amd_bytealign_S (w[32], w[33], offset); + w[48] = amd_bytealign_S (w[31], w[32], offset); + w[47] = amd_bytealign_S (w[30], w[31], offset); + w[46] = amd_bytealign_S (w[29], w[30], offset); + w[45] = amd_bytealign_S (w[28], w[29], offset); + w[44] = amd_bytealign_S (w[27], w[28], offset); + w[43] = amd_bytealign_S (w[26], w[27], offset); + w[42] = amd_bytealign_S (w[25], w[26], offset); + w[41] = amd_bytealign_S (w[24], w[25], offset); + w[40] = amd_bytealign_S (w[23], w[24], offset); + w[39] = amd_bytealign_S (w[22], w[23], offset); + w[38] = amd_bytealign_S (w[21], w[22], offset); + w[37] = amd_bytealign_S (w[20], w[21], offset); + w[36] = amd_bytealign_S (w[19], w[20], offset); + w[35] = amd_bytealign_S (w[18], w[19], offset); + w[34] = amd_bytealign_S (w[17], w[18], offset); + w[33] = amd_bytealign_S (w[16], w[17], offset); + w[32] = amd_bytealign_S (w[15], w[16], offset); + w[31] = amd_bytealign_S (w[14], w[15], offset); + w[30] = amd_bytealign_S (w[13], w[14], offset); + w[29] = amd_bytealign_S (w[12], 
w[13], offset); + w[28] = amd_bytealign_S (w[11], w[12], offset); + w[27] = amd_bytealign_S (w[10], w[11], offset); + w[26] = amd_bytealign_S (w[ 9], w[10], offset); + w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[16] = amd_bytealign_S ( 0, w[ 0], offset); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = amd_bytealign_S (w[45], w[46], offset); + w[62] = amd_bytealign_S (w[44], w[45], offset); + w[61] = amd_bytealign_S (w[43], w[44], offset); + w[60] = amd_bytealign_S (w[42], w[43], offset); + w[59] = amd_bytealign_S (w[41], w[42], offset); + w[58] = amd_bytealign_S (w[40], w[41], offset); + w[57] = amd_bytealign_S (w[39], w[40], offset); + w[56] = amd_bytealign_S (w[38], w[39], offset); + w[55] = amd_bytealign_S (w[37], w[38], offset); + w[54] = amd_bytealign_S (w[36], w[37], offset); + w[53] = amd_bytealign_S (w[35], w[36], offset); + w[52] = amd_bytealign_S (w[34], w[35], offset); + w[51] = amd_bytealign_S (w[33], w[34], offset); + w[50] = amd_bytealign_S (w[32], w[33], offset); + w[49] = amd_bytealign_S (w[31], w[32], offset); + w[48] = amd_bytealign_S (w[30], w[31], offset); + w[47] = amd_bytealign_S (w[29], w[30], offset); + w[46] = amd_bytealign_S (w[28], w[29], offset); + w[45] = amd_bytealign_S (w[27], w[28], offset); + w[44] = amd_bytealign_S (w[26], w[27], offset); + w[43] = amd_bytealign_S (w[25], w[26], offset); + w[42] = amd_bytealign_S (w[24], w[25], offset); + 
w[41] = amd_bytealign_S (w[23], w[24], offset); + w[40] = amd_bytealign_S (w[22], w[23], offset); + w[39] = amd_bytealign_S (w[21], w[22], offset); + w[38] = amd_bytealign_S (w[20], w[21], offset); + w[37] = amd_bytealign_S (w[19], w[20], offset); + w[36] = amd_bytealign_S (w[18], w[19], offset); + w[35] = amd_bytealign_S (w[17], w[18], offset); + w[34] = amd_bytealign_S (w[16], w[17], offset); + w[33] = amd_bytealign_S (w[15], w[16], offset); + w[32] = amd_bytealign_S (w[14], w[15], offset); + w[31] = amd_bytealign_S (w[13], w[14], offset); + w[30] = amd_bytealign_S (w[12], w[13], offset); + w[29] = amd_bytealign_S (w[11], w[12], offset); + w[28] = amd_bytealign_S (w[10], w[11], offset); + w[27] = amd_bytealign_S (w[ 9], w[10], offset); + w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[18] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[17] = amd_bytealign_S ( 0, w[ 0], offset); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = amd_bytealign_S (w[44], w[45], offset); + w[62] = amd_bytealign_S (w[43], w[44], offset); + w[61] = amd_bytealign_S (w[42], w[43], offset); + w[60] = amd_bytealign_S (w[41], w[42], offset); + w[59] = amd_bytealign_S (w[40], w[41], offset); + w[58] = amd_bytealign_S (w[39], w[40], offset); + w[57] = amd_bytealign_S (w[38], w[39], offset); + w[56] = amd_bytealign_S (w[37], w[38], offset); + w[55] = amd_bytealign_S (w[36], w[37], offset); + w[54] = amd_bytealign_S (w[35], w[36], offset); + w[53] = 
amd_bytealign_S (w[34], w[35], offset); + w[52] = amd_bytealign_S (w[33], w[34], offset); + w[51] = amd_bytealign_S (w[32], w[33], offset); + w[50] = amd_bytealign_S (w[31], w[32], offset); + w[49] = amd_bytealign_S (w[30], w[31], offset); + w[48] = amd_bytealign_S (w[29], w[30], offset); + w[47] = amd_bytealign_S (w[28], w[29], offset); + w[46] = amd_bytealign_S (w[27], w[28], offset); + w[45] = amd_bytealign_S (w[26], w[27], offset); + w[44] = amd_bytealign_S (w[25], w[26], offset); + w[43] = amd_bytealign_S (w[24], w[25], offset); + w[42] = amd_bytealign_S (w[23], w[24], offset); + w[41] = amd_bytealign_S (w[22], w[23], offset); + w[40] = amd_bytealign_S (w[21], w[22], offset); + w[39] = amd_bytealign_S (w[20], w[21], offset); + w[38] = amd_bytealign_S (w[19], w[20], offset); + w[37] = amd_bytealign_S (w[18], w[19], offset); + w[36] = amd_bytealign_S (w[17], w[18], offset); + w[35] = amd_bytealign_S (w[16], w[17], offset); + w[34] = amd_bytealign_S (w[15], w[16], offset); + w[33] = amd_bytealign_S (w[14], w[15], offset); + w[32] = amd_bytealign_S (w[13], w[14], offset); + w[31] = amd_bytealign_S (w[12], w[13], offset); + w[30] = amd_bytealign_S (w[11], w[12], offset); + w[29] = amd_bytealign_S (w[10], w[11], offset); + w[28] = amd_bytealign_S (w[ 9], w[10], offset); + w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[24] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[18] = amd_bytealign_S ( 0, w[ 0], offset); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 
1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = amd_bytealign_S (w[43], w[44], offset); + w[62] = amd_bytealign_S (w[42], w[43], offset); + w[61] = amd_bytealign_S (w[41], w[42], offset); + w[60] = amd_bytealign_S (w[40], w[41], offset); + w[59] = amd_bytealign_S (w[39], w[40], offset); + w[58] = amd_bytealign_S (w[38], w[39], offset); + w[57] = amd_bytealign_S (w[37], w[38], offset); + w[56] = amd_bytealign_S (w[36], w[37], offset); + w[55] = amd_bytealign_S (w[35], w[36], offset); + w[54] = amd_bytealign_S (w[34], w[35], offset); + w[53] = amd_bytealign_S (w[33], w[34], offset); + w[52] = amd_bytealign_S (w[32], w[33], offset); + w[51] = amd_bytealign_S (w[31], w[32], offset); + w[50] = amd_bytealign_S (w[30], w[31], offset); + w[49] = amd_bytealign_S (w[29], w[30], offset); + w[48] = amd_bytealign_S (w[28], w[29], offset); + w[47] = amd_bytealign_S (w[27], w[28], offset); + w[46] = amd_bytealign_S (w[26], w[27], offset); + w[45] = amd_bytealign_S (w[25], w[26], offset); + w[44] = amd_bytealign_S (w[24], w[25], offset); + w[43] = amd_bytealign_S (w[23], w[24], offset); + w[42] = amd_bytealign_S (w[22], w[23], offset); + w[41] = amd_bytealign_S (w[21], w[22], offset); + w[40] = amd_bytealign_S (w[20], w[21], offset); + w[39] = amd_bytealign_S (w[19], w[20], offset); + w[38] = amd_bytealign_S (w[18], w[19], offset); + w[37] = amd_bytealign_S (w[17], w[18], offset); + w[36] = amd_bytealign_S (w[16], w[17], offset); + w[35] = amd_bytealign_S (w[15], w[16], offset); + w[34] = amd_bytealign_S (w[14], w[15], offset); + w[33] = amd_bytealign_S (w[13], w[14], offset); + w[32] = amd_bytealign_S (w[12], w[13], offset); + w[31] = amd_bytealign_S (w[11], w[12], offset); + w[30] = amd_bytealign_S (w[10], w[11], offset); + w[29] = amd_bytealign_S (w[ 9], w[10], offset); + w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); + 
w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[19] = amd_bytealign_S ( 0, w[ 0], offset); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = amd_bytealign_S (w[42], w[43], offset); + w[62] = amd_bytealign_S (w[41], w[42], offset); + w[61] = amd_bytealign_S (w[40], w[41], offset); + w[60] = amd_bytealign_S (w[39], w[40], offset); + w[59] = amd_bytealign_S (w[38], w[39], offset); + w[58] = amd_bytealign_S (w[37], w[38], offset); + w[57] = amd_bytealign_S (w[36], w[37], offset); + w[56] = amd_bytealign_S (w[35], w[36], offset); + w[55] = amd_bytealign_S (w[34], w[35], offset); + w[54] = amd_bytealign_S (w[33], w[34], offset); + w[53] = amd_bytealign_S (w[32], w[33], offset); + w[52] = amd_bytealign_S (w[31], w[32], offset); + w[51] = amd_bytealign_S (w[30], w[31], offset); + w[50] = amd_bytealign_S (w[29], w[30], offset); + w[49] = amd_bytealign_S (w[28], w[29], offset); + w[48] = amd_bytealign_S (w[27], w[28], offset); + w[47] = amd_bytealign_S (w[26], w[27], offset); + w[46] = amd_bytealign_S (w[25], w[26], offset); + w[45] = amd_bytealign_S (w[24], w[25], offset); + w[44] = amd_bytealign_S (w[23], w[24], offset); + w[43] = amd_bytealign_S (w[22], w[23], offset); + w[42] = amd_bytealign_S (w[21], w[22], offset); + w[41] = amd_bytealign_S (w[20], w[21], offset); + w[40] = amd_bytealign_S (w[19], w[20], offset); + w[39] = amd_bytealign_S (w[18], w[19], offset); + w[38] = amd_bytealign_S (w[17], w[18], offset); + w[37] = amd_bytealign_S (w[16], w[17], offset); + w[36] = amd_bytealign_S (w[15], w[16], offset); + w[35] = amd_bytealign_S (w[14], 
w[15], offset); + w[34] = amd_bytealign_S (w[13], w[14], offset); + w[33] = amd_bytealign_S (w[12], w[13], offset); + w[32] = amd_bytealign_S (w[11], w[12], offset); + w[31] = amd_bytealign_S (w[10], w[11], offset); + w[30] = amd_bytealign_S (w[ 9], w[10], offset); + w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[28] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[26] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[20] = amd_bytealign_S ( 0, w[ 0], offset); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = amd_bytealign_S (w[41], w[42], offset); + w[62] = amd_bytealign_S (w[40], w[41], offset); + w[61] = amd_bytealign_S (w[39], w[40], offset); + w[60] = amd_bytealign_S (w[38], w[39], offset); + w[59] = amd_bytealign_S (w[37], w[38], offset); + w[58] = amd_bytealign_S (w[36], w[37], offset); + w[57] = amd_bytealign_S (w[35], w[36], offset); + w[56] = amd_bytealign_S (w[34], w[35], offset); + w[55] = amd_bytealign_S (w[33], w[34], offset); + w[54] = amd_bytealign_S (w[32], w[33], offset); + w[53] = amd_bytealign_S (w[31], w[32], offset); + w[52] = amd_bytealign_S (w[30], w[31], offset); + w[51] = amd_bytealign_S (w[29], w[30], offset); + w[50] = amd_bytealign_S (w[28], w[29], offset); + w[49] = amd_bytealign_S (w[27], w[28], offset); + w[48] = amd_bytealign_S (w[26], w[27], offset); + w[47] = amd_bytealign_S (w[25], w[26], offset); + w[46] = amd_bytealign_S (w[24], w[25], offset); + w[45] = amd_bytealign_S (w[23], w[24], offset); + 
w[44] = amd_bytealign_S (w[22], w[23], offset); + w[43] = amd_bytealign_S (w[21], w[22], offset); + w[42] = amd_bytealign_S (w[20], w[21], offset); + w[41] = amd_bytealign_S (w[19], w[20], offset); + w[40] = amd_bytealign_S (w[18], w[19], offset); + w[39] = amd_bytealign_S (w[17], w[18], offset); + w[38] = amd_bytealign_S (w[16], w[17], offset); + w[37] = amd_bytealign_S (w[15], w[16], offset); + w[36] = amd_bytealign_S (w[14], w[15], offset); + w[35] = amd_bytealign_S (w[13], w[14], offset); + w[34] = amd_bytealign_S (w[12], w[13], offset); + w[33] = amd_bytealign_S (w[11], w[12], offset); + w[32] = amd_bytealign_S (w[10], w[11], offset); + w[31] = amd_bytealign_S (w[ 9], w[10], offset); + w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[21] = amd_bytealign_S ( 0, w[ 0], offset); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = amd_bytealign_S (w[40], w[41], offset); + w[62] = amd_bytealign_S (w[39], w[40], offset); + w[61] = amd_bytealign_S (w[38], w[39], offset); + w[60] = amd_bytealign_S (w[37], w[38], offset); + w[59] = amd_bytealign_S (w[36], w[37], offset); + w[58] = amd_bytealign_S (w[35], w[36], offset); + w[57] = amd_bytealign_S (w[34], w[35], offset); + w[56] = amd_bytealign_S (w[33], w[34], offset); + w[55] = amd_bytealign_S (w[32], w[33], offset); + w[54] = amd_bytealign_S (w[31], w[32], offset); + w[53] 
= amd_bytealign_S (w[30], w[31], offset); + w[52] = amd_bytealign_S (w[29], w[30], offset); + w[51] = amd_bytealign_S (w[28], w[29], offset); + w[50] = amd_bytealign_S (w[27], w[28], offset); + w[49] = amd_bytealign_S (w[26], w[27], offset); + w[48] = amd_bytealign_S (w[25], w[26], offset); + w[47] = amd_bytealign_S (w[24], w[25], offset); + w[46] = amd_bytealign_S (w[23], w[24], offset); + w[45] = amd_bytealign_S (w[22], w[23], offset); + w[44] = amd_bytealign_S (w[21], w[22], offset); + w[43] = amd_bytealign_S (w[20], w[21], offset); + w[42] = amd_bytealign_S (w[19], w[20], offset); + w[41] = amd_bytealign_S (w[18], w[19], offset); + w[40] = amd_bytealign_S (w[17], w[18], offset); + w[39] = amd_bytealign_S (w[16], w[17], offset); + w[38] = amd_bytealign_S (w[15], w[16], offset); + w[37] = amd_bytealign_S (w[14], w[15], offset); + w[36] = amd_bytealign_S (w[13], w[14], offset); + w[35] = amd_bytealign_S (w[12], w[13], offset); + w[34] = amd_bytealign_S (w[11], w[12], offset); + w[33] = amd_bytealign_S (w[10], w[11], offset); + w[32] = amd_bytealign_S (w[ 9], w[10], offset); + w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[22] = amd_bytealign_S ( 0, w[ 0], offset); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = amd_bytealign_S (w[39], w[40], offset); + w[62] = amd_bytealign_S (w[38], w[39], offset); 
+ w[61] = amd_bytealign_S (w[37], w[38], offset); + w[60] = amd_bytealign_S (w[36], w[37], offset); + w[59] = amd_bytealign_S (w[35], w[36], offset); + w[58] = amd_bytealign_S (w[34], w[35], offset); + w[57] = amd_bytealign_S (w[33], w[34], offset); + w[56] = amd_bytealign_S (w[32], w[33], offset); + w[55] = amd_bytealign_S (w[31], w[32], offset); + w[54] = amd_bytealign_S (w[30], w[31], offset); + w[53] = amd_bytealign_S (w[29], w[30], offset); + w[52] = amd_bytealign_S (w[28], w[29], offset); + w[51] = amd_bytealign_S (w[27], w[28], offset); + w[50] = amd_bytealign_S (w[26], w[27], offset); + w[49] = amd_bytealign_S (w[25], w[26], offset); + w[48] = amd_bytealign_S (w[24], w[25], offset); + w[47] = amd_bytealign_S (w[23], w[24], offset); + w[46] = amd_bytealign_S (w[22], w[23], offset); + w[45] = amd_bytealign_S (w[21], w[22], offset); + w[44] = amd_bytealign_S (w[20], w[21], offset); + w[43] = amd_bytealign_S (w[19], w[20], offset); + w[42] = amd_bytealign_S (w[18], w[19], offset); + w[41] = amd_bytealign_S (w[17], w[18], offset); + w[40] = amd_bytealign_S (w[16], w[17], offset); + w[39] = amd_bytealign_S (w[15], w[16], offset); + w[38] = amd_bytealign_S (w[14], w[15], offset); + w[37] = amd_bytealign_S (w[13], w[14], offset); + w[36] = amd_bytealign_S (w[12], w[13], offset); + w[35] = amd_bytealign_S (w[11], w[12], offset); + w[34] = amd_bytealign_S (w[10], w[11], offset); + w[33] = amd_bytealign_S (w[ 9], w[10], offset); + w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[23] = amd_bytealign_S ( 0, w[ 0], offset); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; 
+ w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = amd_bytealign_S (w[38], w[39], offset); + w[62] = amd_bytealign_S (w[37], w[38], offset); + w[61] = amd_bytealign_S (w[36], w[37], offset); + w[60] = amd_bytealign_S (w[35], w[36], offset); + w[59] = amd_bytealign_S (w[34], w[35], offset); + w[58] = amd_bytealign_S (w[33], w[34], offset); + w[57] = amd_bytealign_S (w[32], w[33], offset); + w[56] = amd_bytealign_S (w[31], w[32], offset); + w[55] = amd_bytealign_S (w[30], w[31], offset); + w[54] = amd_bytealign_S (w[29], w[30], offset); + w[53] = amd_bytealign_S (w[28], w[29], offset); + w[52] = amd_bytealign_S (w[27], w[28], offset); + w[51] = amd_bytealign_S (w[26], w[27], offset); + w[50] = amd_bytealign_S (w[25], w[26], offset); + w[49] = amd_bytealign_S (w[24], w[25], offset); + w[48] = amd_bytealign_S (w[23], w[24], offset); + w[47] = amd_bytealign_S (w[22], w[23], offset); + w[46] = amd_bytealign_S (w[21], w[22], offset); + w[45] = amd_bytealign_S (w[20], w[21], offset); + w[44] = amd_bytealign_S (w[19], w[20], offset); + w[43] = amd_bytealign_S (w[18], w[19], offset); + w[42] = amd_bytealign_S (w[17], w[18], offset); + w[41] = amd_bytealign_S (w[16], w[17], offset); + w[40] = amd_bytealign_S (w[15], w[16], offset); + w[39] = amd_bytealign_S (w[14], w[15], offset); + w[38] = amd_bytealign_S (w[13], w[14], offset); + w[37] = amd_bytealign_S (w[12], w[13], offset); + w[36] = amd_bytealign_S (w[11], w[12], offset); + w[35] = amd_bytealign_S (w[10], w[11], offset); + w[34] = amd_bytealign_S (w[ 9], w[10], offset); + w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[29] = amd_bytealign_S 
(w[ 4], w[ 5], offset); + w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[25] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[24] = amd_bytealign_S ( 0, w[ 0], offset); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = amd_bytealign_S (w[37], w[38], offset); + w[62] = amd_bytealign_S (w[36], w[37], offset); + w[61] = amd_bytealign_S (w[35], w[36], offset); + w[60] = amd_bytealign_S (w[34], w[35], offset); + w[59] = amd_bytealign_S (w[33], w[34], offset); + w[58] = amd_bytealign_S (w[32], w[33], offset); + w[57] = amd_bytealign_S (w[31], w[32], offset); + w[56] = amd_bytealign_S (w[30], w[31], offset); + w[55] = amd_bytealign_S (w[29], w[30], offset); + w[54] = amd_bytealign_S (w[28], w[29], offset); + w[53] = amd_bytealign_S (w[27], w[28], offset); + w[52] = amd_bytealign_S (w[26], w[27], offset); + w[51] = amd_bytealign_S (w[25], w[26], offset); + w[50] = amd_bytealign_S (w[24], w[25], offset); + w[49] = amd_bytealign_S (w[23], w[24], offset); + w[48] = amd_bytealign_S (w[22], w[23], offset); + w[47] = amd_bytealign_S (w[21], w[22], offset); + w[46] = amd_bytealign_S (w[20], w[21], offset); + w[45] = amd_bytealign_S (w[19], w[20], offset); + w[44] = amd_bytealign_S (w[18], w[19], offset); + w[43] = amd_bytealign_S (w[17], w[18], offset); + w[42] = amd_bytealign_S (w[16], w[17], offset); + w[41] = amd_bytealign_S (w[15], w[16], offset); + w[40] = amd_bytealign_S (w[14], w[15], offset); + w[39] = amd_bytealign_S (w[13], w[14], offset); + w[38] = amd_bytealign_S (w[12], w[13], offset); + w[37] = amd_bytealign_S (w[11], w[12], offset); + w[36] = amd_bytealign_S (w[10], w[11], 
offset); + w[35] = amd_bytealign_S (w[ 9], w[10], offset); + w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[31] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[25] = amd_bytealign_S ( 0, w[ 0], offset); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = amd_bytealign_S (w[36], w[37], offset); + w[62] = amd_bytealign_S (w[35], w[36], offset); + w[61] = amd_bytealign_S (w[34], w[35], offset); + w[60] = amd_bytealign_S (w[33], w[34], offset); + w[59] = amd_bytealign_S (w[32], w[33], offset); + w[58] = amd_bytealign_S (w[31], w[32], offset); + w[57] = amd_bytealign_S (w[30], w[31], offset); + w[56] = amd_bytealign_S (w[29], w[30], offset); + w[55] = amd_bytealign_S (w[28], w[29], offset); + w[54] = amd_bytealign_S (w[27], w[28], offset); + w[53] = amd_bytealign_S (w[26], w[27], offset); + w[52] = amd_bytealign_S (w[25], w[26], offset); + w[51] = amd_bytealign_S (w[24], w[25], offset); + w[50] = amd_bytealign_S (w[23], w[24], offset); + w[49] = amd_bytealign_S (w[22], w[23], offset); + w[48] = amd_bytealign_S (w[21], w[22], offset); + w[47] = amd_bytealign_S (w[20], w[21], offset); + w[46] = amd_bytealign_S (w[19], w[20], offset); + w[45] = amd_bytealign_S (w[18], w[19], offset); + w[44] = amd_bytealign_S (w[17], w[18], offset); + w[43] = amd_bytealign_S (w[16], w[17], offset); + w[42] = amd_bytealign_S (w[15], w[16], 
offset); + w[41] = amd_bytealign_S (w[14], w[15], offset); + w[40] = amd_bytealign_S (w[13], w[14], offset); + w[39] = amd_bytealign_S (w[12], w[13], offset); + w[38] = amd_bytealign_S (w[11], w[12], offset); + w[37] = amd_bytealign_S (w[10], w[11], offset); + w[36] = amd_bytealign_S (w[ 9], w[10], offset); + w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[31] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[26] = amd_bytealign_S ( 0, w[ 0], offset); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = amd_bytealign_S (w[35], w[36], offset); + w[62] = amd_bytealign_S (w[34], w[35], offset); + w[61] = amd_bytealign_S (w[33], w[34], offset); + w[60] = amd_bytealign_S (w[32], w[33], offset); + w[59] = amd_bytealign_S (w[31], w[32], offset); + w[58] = amd_bytealign_S (w[30], w[31], offset); + w[57] = amd_bytealign_S (w[29], w[30], offset); + w[56] = amd_bytealign_S (w[28], w[29], offset); + w[55] = amd_bytealign_S (w[27], w[28], offset); + w[54] = amd_bytealign_S (w[26], w[27], offset); + w[53] = amd_bytealign_S (w[25], w[26], offset); + w[52] = amd_bytealign_S (w[24], w[25], offset); + w[51] = amd_bytealign_S (w[23], w[24], offset); + w[50] = amd_bytealign_S (w[22], w[23], offset); + w[49] = amd_bytealign_S (w[21], w[22], offset); + w[48] = amd_bytealign_S (w[20], w[21], offset); + w[47] = amd_bytealign_S 
(w[19], w[20], offset); + w[46] = amd_bytealign_S (w[18], w[19], offset); + w[45] = amd_bytealign_S (w[17], w[18], offset); + w[44] = amd_bytealign_S (w[16], w[17], offset); + w[43] = amd_bytealign_S (w[15], w[16], offset); + w[42] = amd_bytealign_S (w[14], w[15], offset); + w[41] = amd_bytealign_S (w[13], w[14], offset); + w[40] = amd_bytealign_S (w[12], w[13], offset); + w[39] = amd_bytealign_S (w[11], w[12], offset); + w[38] = amd_bytealign_S (w[10], w[11], offset); + w[37] = amd_bytealign_S (w[ 9], w[10], offset); + w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[30] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[27] = amd_bytealign_S ( 0, w[ 0], offset); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = amd_bytealign_S (w[34], w[35], offset); + w[62] = amd_bytealign_S (w[33], w[34], offset); + w[61] = amd_bytealign_S (w[32], w[33], offset); + w[60] = amd_bytealign_S (w[31], w[32], offset); + w[59] = amd_bytealign_S (w[30], w[31], offset); + w[58] = amd_bytealign_S (w[29], w[30], offset); + w[57] = amd_bytealign_S (w[28], w[29], offset); + w[56] = amd_bytealign_S (w[27], w[28], offset); + w[55] = amd_bytealign_S (w[26], w[27], offset); + w[54] = amd_bytealign_S (w[25], w[26], offset); + w[53] = amd_bytealign_S (w[24], w[25], offset); + w[52] = amd_bytealign_S (w[23], w[24], offset); + 
w[51] = amd_bytealign_S (w[22], w[23], offset); + w[50] = amd_bytealign_S (w[21], w[22], offset); + w[49] = amd_bytealign_S (w[20], w[21], offset); + w[48] = amd_bytealign_S (w[19], w[20], offset); + w[47] = amd_bytealign_S (w[18], w[19], offset); + w[46] = amd_bytealign_S (w[17], w[18], offset); + w[45] = amd_bytealign_S (w[16], w[17], offset); + w[44] = amd_bytealign_S (w[15], w[16], offset); + w[43] = amd_bytealign_S (w[14], w[15], offset); + w[42] = amd_bytealign_S (w[13], w[14], offset); + w[41] = amd_bytealign_S (w[12], w[13], offset); + w[40] = amd_bytealign_S (w[11], w[12], offset); + w[39] = amd_bytealign_S (w[10], w[11], offset); + w[38] = amd_bytealign_S (w[ 9], w[10], offset); + w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[35] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[28] = amd_bytealign_S ( 0, w[ 0], offset); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = amd_bytealign_S (w[33], w[34], offset); + w[62] = amd_bytealign_S (w[32], w[33], offset); + w[61] = amd_bytealign_S (w[31], w[32], offset); + w[60] = amd_bytealign_S (w[30], w[31], offset); + w[59] = amd_bytealign_S (w[29], w[30], offset); + w[58] = amd_bytealign_S (w[28], w[29], offset); + w[57] = amd_bytealign_S (w[27], w[28], offset); + w[56] = amd_bytealign_S (w[26], w[27], offset); + w[55] = 
amd_bytealign_S (w[25], w[26], offset); + w[54] = amd_bytealign_S (w[24], w[25], offset); + w[53] = amd_bytealign_S (w[23], w[24], offset); + w[52] = amd_bytealign_S (w[22], w[23], offset); + w[51] = amd_bytealign_S (w[21], w[22], offset); + w[50] = amd_bytealign_S (w[20], w[21], offset); + w[49] = amd_bytealign_S (w[19], w[20], offset); + w[48] = amd_bytealign_S (w[18], w[19], offset); + w[47] = amd_bytealign_S (w[17], w[18], offset); + w[46] = amd_bytealign_S (w[16], w[17], offset); + w[45] = amd_bytealign_S (w[15], w[16], offset); + w[44] = amd_bytealign_S (w[14], w[15], offset); + w[43] = amd_bytealign_S (w[13], w[14], offset); + w[42] = amd_bytealign_S (w[12], w[13], offset); + w[41] = amd_bytealign_S (w[11], w[12], offset); + w[40] = amd_bytealign_S (w[10], w[11], offset); + w[39] = amd_bytealign_S (w[ 9], w[10], offset); + w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[36] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[29] = amd_bytealign_S ( 0, w[ 0], offset); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = amd_bytealign_S (w[32], w[33], offset); + w[62] = amd_bytealign_S (w[31], w[32], offset); + w[61] = amd_bytealign_S (w[30], w[31], offset); + w[60] = amd_bytealign_S (w[29], w[30], offset); + w[59] = amd_bytealign_S (w[28], w[29], offset); + w[58] = 
amd_bytealign_S (w[27], w[28], offset); + w[57] = amd_bytealign_S (w[26], w[27], offset); + w[56] = amd_bytealign_S (w[25], w[26], offset); + w[55] = amd_bytealign_S (w[24], w[25], offset); + w[54] = amd_bytealign_S (w[23], w[24], offset); + w[53] = amd_bytealign_S (w[22], w[23], offset); + w[52] = amd_bytealign_S (w[21], w[22], offset); + w[51] = amd_bytealign_S (w[20], w[21], offset); + w[50] = amd_bytealign_S (w[19], w[20], offset); + w[49] = amd_bytealign_S (w[18], w[19], offset); + w[48] = amd_bytealign_S (w[17], w[18], offset); + w[47] = amd_bytealign_S (w[16], w[17], offset); + w[46] = amd_bytealign_S (w[15], w[16], offset); + w[45] = amd_bytealign_S (w[14], w[15], offset); + w[44] = amd_bytealign_S (w[13], w[14], offset); + w[43] = amd_bytealign_S (w[12], w[13], offset); + w[42] = amd_bytealign_S (w[11], w[12], offset); + w[41] = amd_bytealign_S (w[10], w[11], offset); + w[40] = amd_bytealign_S (w[ 9], w[10], offset); + w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[37] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[30] = amd_bytealign_S ( 0, w[ 0], offset); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = amd_bytealign_S (w[31], w[32], offset); + w[62] = amd_bytealign_S (w[30], w[31], offset); + w[61] = amd_bytealign_S (w[29], w[30], 
offset); + w[60] = amd_bytealign_S (w[28], w[29], offset); + w[59] = amd_bytealign_S (w[27], w[28], offset); + w[58] = amd_bytealign_S (w[26], w[27], offset); + w[57] = amd_bytealign_S (w[25], w[26], offset); + w[56] = amd_bytealign_S (w[24], w[25], offset); + w[55] = amd_bytealign_S (w[23], w[24], offset); + w[54] = amd_bytealign_S (w[22], w[23], offset); + w[53] = amd_bytealign_S (w[21], w[22], offset); + w[52] = amd_bytealign_S (w[20], w[21], offset); + w[51] = amd_bytealign_S (w[19], w[20], offset); + w[50] = amd_bytealign_S (w[18], w[19], offset); + w[49] = amd_bytealign_S (w[17], w[18], offset); + w[48] = amd_bytealign_S (w[16], w[17], offset); + w[47] = amd_bytealign_S (w[15], w[16], offset); + w[46] = amd_bytealign_S (w[14], w[15], offset); + w[45] = amd_bytealign_S (w[13], w[14], offset); + w[44] = amd_bytealign_S (w[12], w[13], offset); + w[43] = amd_bytealign_S (w[11], w[12], offset); + w[42] = amd_bytealign_S (w[10], w[11], offset); + w[41] = amd_bytealign_S (w[ 9], w[10], offset); + w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[36] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[31] = amd_bytealign_S ( 0, w[ 0], offset); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = amd_bytealign_S (w[30], w[31], offset); + w[62] = 
amd_bytealign_S (w[29], w[30], offset); + w[61] = amd_bytealign_S (w[28], w[29], offset); + w[60] = amd_bytealign_S (w[27], w[28], offset); + w[59] = amd_bytealign_S (w[26], w[27], offset); + w[58] = amd_bytealign_S (w[25], w[26], offset); + w[57] = amd_bytealign_S (w[24], w[25], offset); + w[56] = amd_bytealign_S (w[23], w[24], offset); + w[55] = amd_bytealign_S (w[22], w[23], offset); + w[54] = amd_bytealign_S (w[21], w[22], offset); + w[53] = amd_bytealign_S (w[20], w[21], offset); + w[52] = amd_bytealign_S (w[19], w[20], offset); + w[51] = amd_bytealign_S (w[18], w[19], offset); + w[50] = amd_bytealign_S (w[17], w[18], offset); + w[49] = amd_bytealign_S (w[16], w[17], offset); + w[48] = amd_bytealign_S (w[15], w[16], offset); + w[47] = amd_bytealign_S (w[14], w[15], offset); + w[46] = amd_bytealign_S (w[13], w[14], offset); + w[45] = amd_bytealign_S (w[12], w[13], offset); + w[44] = amd_bytealign_S (w[11], w[12], offset); + w[43] = amd_bytealign_S (w[10], w[11], offset); + w[42] = amd_bytealign_S (w[ 9], w[10], offset); + w[41] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[35] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[32] = amd_bytealign_S ( 0, w[ 0], offset); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = 
amd_bytealign_S (w[29], w[30], offset); + w[62] = amd_bytealign_S (w[28], w[29], offset); + w[61] = amd_bytealign_S (w[27], w[28], offset); + w[60] = amd_bytealign_S (w[26], w[27], offset); + w[59] = amd_bytealign_S (w[25], w[26], offset); + w[58] = amd_bytealign_S (w[24], w[25], offset); + w[57] = amd_bytealign_S (w[23], w[24], offset); + w[56] = amd_bytealign_S (w[22], w[23], offset); + w[55] = amd_bytealign_S (w[21], w[22], offset); + w[54] = amd_bytealign_S (w[20], w[21], offset); + w[53] = amd_bytealign_S (w[19], w[20], offset); + w[52] = amd_bytealign_S (w[18], w[19], offset); + w[51] = amd_bytealign_S (w[17], w[18], offset); + w[50] = amd_bytealign_S (w[16], w[17], offset); + w[49] = amd_bytealign_S (w[15], w[16], offset); + w[48] = amd_bytealign_S (w[14], w[15], offset); + w[47] = amd_bytealign_S (w[13], w[14], offset); + w[46] = amd_bytealign_S (w[12], w[13], offset); + w[45] = amd_bytealign_S (w[11], w[12], offset); + w[44] = amd_bytealign_S (w[10], w[11], offset); + w[43] = amd_bytealign_S (w[ 9], w[10], offset); + w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[33] = amd_bytealign_S ( 0, w[ 0], offset); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = 
amd_bytealign_S (w[28], w[29], offset); + w[62] = amd_bytealign_S (w[27], w[28], offset); + w[61] = amd_bytealign_S (w[26], w[27], offset); + w[60] = amd_bytealign_S (w[25], w[26], offset); + w[59] = amd_bytealign_S (w[24], w[25], offset); + w[58] = amd_bytealign_S (w[23], w[24], offset); + w[57] = amd_bytealign_S (w[22], w[23], offset); + w[56] = amd_bytealign_S (w[21], w[22], offset); + w[55] = amd_bytealign_S (w[20], w[21], offset); + w[54] = amd_bytealign_S (w[19], w[20], offset); + w[53] = amd_bytealign_S (w[18], w[19], offset); + w[52] = amd_bytealign_S (w[17], w[18], offset); + w[51] = amd_bytealign_S (w[16], w[17], offset); + w[50] = amd_bytealign_S (w[15], w[16], offset); + w[49] = amd_bytealign_S (w[14], w[15], offset); + w[48] = amd_bytealign_S (w[13], w[14], offset); + w[47] = amd_bytealign_S (w[12], w[13], offset); + w[46] = amd_bytealign_S (w[11], w[12], offset); + w[45] = amd_bytealign_S (w[10], w[11], offset); + w[44] = amd_bytealign_S (w[ 9], w[10], offset); + w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[34] = amd_bytealign_S ( 0, w[ 0], offset); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = amd_bytealign_S (w[27], w[28], 
offset); + w[62] = amd_bytealign_S (w[26], w[27], offset); + w[61] = amd_bytealign_S (w[25], w[26], offset); + w[60] = amd_bytealign_S (w[24], w[25], offset); + w[59] = amd_bytealign_S (w[23], w[24], offset); + w[58] = amd_bytealign_S (w[22], w[23], offset); + w[57] = amd_bytealign_S (w[21], w[22], offset); + w[56] = amd_bytealign_S (w[20], w[21], offset); + w[55] = amd_bytealign_S (w[19], w[20], offset); + w[54] = amd_bytealign_S (w[18], w[19], offset); + w[53] = amd_bytealign_S (w[17], w[18], offset); + w[52] = amd_bytealign_S (w[16], w[17], offset); + w[51] = amd_bytealign_S (w[15], w[16], offset); + w[50] = amd_bytealign_S (w[14], w[15], offset); + w[49] = amd_bytealign_S (w[13], w[14], offset); + w[48] = amd_bytealign_S (w[12], w[13], offset); + w[47] = amd_bytealign_S (w[11], w[12], offset); + w[46] = amd_bytealign_S (w[10], w[11], offset); + w[45] = amd_bytealign_S (w[ 9], w[10], offset); + w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[42] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[35] = amd_bytealign_S ( 0, w[ 0], offset); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = amd_bytealign_S (w[26], w[27], offset); + w[62] = amd_bytealign_S (w[25], w[26], offset); + w[61] = 
amd_bytealign_S (w[24], w[25], offset); + w[60] = amd_bytealign_S (w[23], w[24], offset); + w[59] = amd_bytealign_S (w[22], w[23], offset); + w[58] = amd_bytealign_S (w[21], w[22], offset); + w[57] = amd_bytealign_S (w[20], w[21], offset); + w[56] = amd_bytealign_S (w[19], w[20], offset); + w[55] = amd_bytealign_S (w[18], w[19], offset); + w[54] = amd_bytealign_S (w[17], w[18], offset); + w[53] = amd_bytealign_S (w[16], w[17], offset); + w[52] = amd_bytealign_S (w[15], w[16], offset); + w[51] = amd_bytealign_S (w[14], w[15], offset); + w[50] = amd_bytealign_S (w[13], w[14], offset); + w[49] = amd_bytealign_S (w[12], w[13], offset); + w[48] = amd_bytealign_S (w[11], w[12], offset); + w[47] = amd_bytealign_S (w[10], w[11], offset); + w[46] = amd_bytealign_S (w[ 9], w[10], offset); + w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[38] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[36] = amd_bytealign_S ( 0, w[ 0], offset); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = amd_bytealign_S (w[25], w[26], offset); + w[62] = amd_bytealign_S (w[24], w[25], offset); + w[61] = amd_bytealign_S (w[23], w[24], offset); + w[60] = amd_bytealign_S (w[22], w[23], offset); + w[59] = 
amd_bytealign_S (w[21], w[22], offset); + w[58] = amd_bytealign_S (w[20], w[21], offset); + w[57] = amd_bytealign_S (w[19], w[20], offset); + w[56] = amd_bytealign_S (w[18], w[19], offset); + w[55] = amd_bytealign_S (w[17], w[18], offset); + w[54] = amd_bytealign_S (w[16], w[17], offset); + w[53] = amd_bytealign_S (w[15], w[16], offset); + w[52] = amd_bytealign_S (w[14], w[15], offset); + w[51] = amd_bytealign_S (w[13], w[14], offset); + w[50] = amd_bytealign_S (w[12], w[13], offset); + w[49] = amd_bytealign_S (w[11], w[12], offset); + w[48] = amd_bytealign_S (w[10], w[11], offset); + w[47] = amd_bytealign_S (w[ 9], w[10], offset); + w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[45] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[37] = amd_bytealign_S ( 0, w[ 0], offset); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = amd_bytealign_S (w[24], w[25], offset); + w[62] = amd_bytealign_S (w[23], w[24], offset); + w[61] = amd_bytealign_S (w[22], w[23], offset); + w[60] = amd_bytealign_S (w[21], w[22], offset); + w[59] = amd_bytealign_S (w[20], w[21], offset); + w[58] = amd_bytealign_S (w[19], w[20], offset); + w[57] = amd_bytealign_S (w[18], w[19], offset); + w[56] 
= amd_bytealign_S (w[17], w[18], offset); + w[55] = amd_bytealign_S (w[16], w[17], offset); + w[54] = amd_bytealign_S (w[15], w[16], offset); + w[53] = amd_bytealign_S (w[14], w[15], offset); + w[52] = amd_bytealign_S (w[13], w[14], offset); + w[51] = amd_bytealign_S (w[12], w[13], offset); + w[50] = amd_bytealign_S (w[11], w[12], offset); + w[49] = amd_bytealign_S (w[10], w[11], offset); + w[48] = amd_bytealign_S (w[ 9], w[10], offset); + w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[39] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[38] = amd_bytealign_S ( 0, w[ 0], offset); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = amd_bytealign_S (w[23], w[24], offset); + w[62] = amd_bytealign_S (w[22], w[23], offset); + w[61] = amd_bytealign_S (w[21], w[22], offset); + w[60] = amd_bytealign_S (w[20], w[21], offset); + w[59] = amd_bytealign_S (w[19], w[20], offset); + w[58] = amd_bytealign_S (w[18], w[19], offset); + w[57] = amd_bytealign_S (w[17], w[18], offset); + w[56] = amd_bytealign_S (w[16], w[17], offset); + w[55] = amd_bytealign_S (w[15], w[16], offset); + w[54] = amd_bytealign_S (w[14], w[15], offset); + w[53] = amd_bytealign_S (w[13], w[14], 
offset); + w[52] = amd_bytealign_S (w[12], w[13], offset); + w[51] = amd_bytealign_S (w[11], w[12], offset); + w[50] = amd_bytealign_S (w[10], w[11], offset); + w[49] = amd_bytealign_S (w[ 9], w[10], offset); + w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[47] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[39] = amd_bytealign_S ( 0, w[ 0], offset); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = amd_bytealign_S (w[22], w[23], offset); + w[62] = amd_bytealign_S (w[21], w[22], offset); + w[61] = amd_bytealign_S (w[20], w[21], offset); + w[60] = amd_bytealign_S (w[19], w[20], offset); + w[59] = amd_bytealign_S (w[18], w[19], offset); + w[58] = amd_bytealign_S (w[17], w[18], offset); + w[57] = amd_bytealign_S (w[16], w[17], offset); + w[56] = amd_bytealign_S (w[15], w[16], offset); + w[55] = amd_bytealign_S (w[14], w[15], offset); + w[54] = amd_bytealign_S (w[13], w[14], offset); + w[53] = amd_bytealign_S (w[12], w[13], offset); + w[52] = amd_bytealign_S (w[11], w[12], offset); + w[51] = amd_bytealign_S (w[10], w[11], offset); + w[50] = amd_bytealign_S (w[ 9], w[10], offset); + w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[48] = 
amd_bytealign_S (w[ 7], w[ 8], offset); + w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[40] = amd_bytealign_S ( 0, w[ 0], offset); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = amd_bytealign_S (w[21], w[22], offset); + w[62] = amd_bytealign_S (w[20], w[21], offset); + w[61] = amd_bytealign_S (w[19], w[20], offset); + w[60] = amd_bytealign_S (w[18], w[19], offset); + w[59] = amd_bytealign_S (w[17], w[18], offset); + w[58] = amd_bytealign_S (w[16], w[17], offset); + w[57] = amd_bytealign_S (w[15], w[16], offset); + w[56] = amd_bytealign_S (w[14], w[15], offset); + w[55] = amd_bytealign_S (w[13], w[14], offset); + w[54] = amd_bytealign_S (w[12], w[13], offset); + w[53] = amd_bytealign_S (w[11], w[12], offset); + w[52] = amd_bytealign_S (w[10], w[11], offset); + w[51] = amd_bytealign_S (w[ 9], w[10], offset); + w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[43] = 
amd_bytealign_S (w[ 1], w[ 2], offset); + w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[41] = amd_bytealign_S ( 0, w[ 0], offset); + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = amd_bytealign_S (w[20], w[21], offset); + w[62] = amd_bytealign_S (w[19], w[20], offset); + w[61] = amd_bytealign_S (w[18], w[19], offset); + w[60] = amd_bytealign_S (w[17], w[18], offset); + w[59] = amd_bytealign_S (w[16], w[17], offset); + w[58] = amd_bytealign_S (w[15], w[16], offset); + w[57] = amd_bytealign_S (w[14], w[15], offset); + w[56] = amd_bytealign_S (w[13], w[14], offset); + w[55] = amd_bytealign_S (w[12], w[13], offset); + w[54] = amd_bytealign_S (w[11], w[12], offset); + w[53] = amd_bytealign_S (w[10], w[11], offset); + w[52] = amd_bytealign_S (w[ 9], w[10], offset); + w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[49] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[42] = amd_bytealign_S ( 0, w[ 0], offset); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; 
+ w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = amd_bytealign_S (w[19], w[20], offset); + w[62] = amd_bytealign_S (w[18], w[19], offset); + w[61] = amd_bytealign_S (w[17], w[18], offset); + w[60] = amd_bytealign_S (w[16], w[17], offset); + w[59] = amd_bytealign_S (w[15], w[16], offset); + w[58] = amd_bytealign_S (w[14], w[15], offset); + w[57] = amd_bytealign_S (w[13], w[14], offset); + w[56] = amd_bytealign_S (w[12], w[13], offset); + w[55] = amd_bytealign_S (w[11], w[12], offset); + w[54] = amd_bytealign_S (w[10], w[11], offset); + w[53] = amd_bytealign_S (w[ 9], w[10], offset); + w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[43] = amd_bytealign_S ( 0, w[ 0], offset); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] 
= amd_bytealign_S (w[18], w[19], offset); + w[62] = amd_bytealign_S (w[17], w[18], offset); + w[61] = amd_bytealign_S (w[16], w[17], offset); + w[60] = amd_bytealign_S (w[15], w[16], offset); + w[59] = amd_bytealign_S (w[14], w[15], offset); + w[58] = amd_bytealign_S (w[13], w[14], offset); + w[57] = amd_bytealign_S (w[12], w[13], offset); + w[56] = amd_bytealign_S (w[11], w[12], offset); + w[55] = amd_bytealign_S (w[10], w[11], offset); + w[54] = amd_bytealign_S (w[ 9], w[10], offset); + w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[49] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[44] = amd_bytealign_S ( 0, w[ 0], offset); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = amd_bytealign_S (w[17], w[18], offset); + w[62] = amd_bytealign_S (w[16], w[17], offset); + w[61] = amd_bytealign_S (w[15], w[16], offset); + w[60] = amd_bytealign_S (w[14], w[15], offset); + w[59] = amd_bytealign_S (w[13], w[14], offset); + w[58] = amd_bytealign_S (w[12], w[13], offset); + w[57] = amd_bytealign_S (w[11], w[12], offset); + w[56] = amd_bytealign_S (w[10], w[11], offset); + w[55] = 
amd_bytealign_S (w[ 9], w[10], offset); + w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[45] = amd_bytealign_S ( 0, w[ 0], offset); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = amd_bytealign_S (w[16], w[17], offset); + w[62] = amd_bytealign_S (w[15], w[16], offset); + w[61] = amd_bytealign_S (w[14], w[15], offset); + w[60] = amd_bytealign_S (w[13], w[14], offset); + w[59] = amd_bytealign_S (w[12], w[13], offset); + w[58] = amd_bytealign_S (w[11], w[12], offset); + w[57] = amd_bytealign_S (w[10], w[11], offset); + w[56] = amd_bytealign_S (w[ 9], w[10], offset); + w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); + 
w[46] = amd_bytealign_S ( 0, w[ 0], offset); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = amd_bytealign_S (w[15], w[16], offset); + w[62] = amd_bytealign_S (w[14], w[15], offset); + w[61] = amd_bytealign_S (w[13], w[14], offset); + w[60] = amd_bytealign_S (w[12], w[13], offset); + w[59] = amd_bytealign_S (w[11], w[12], offset); + w[58] = amd_bytealign_S (w[10], w[11], offset); + w[57] = amd_bytealign_S (w[ 9], w[10], offset); + w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[47] = amd_bytealign_S ( 0, w[ 0], offset); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 
9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 48: + w[63] = amd_bytealign_S (w[14], w[15], offset); + w[62] = amd_bytealign_S (w[13], w[14], offset); + w[61] = amd_bytealign_S (w[12], w[13], offset); + w[60] = amd_bytealign_S (w[11], w[12], offset); + w[59] = amd_bytealign_S (w[10], w[11], offset); + w[58] = amd_bytealign_S (w[ 9], w[10], offset); + w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[48] = amd_bytealign_S ( 0, w[ 0], offset); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = amd_bytealign_S (w[13], w[14], offset); + w[62] = amd_bytealign_S (w[12], w[13], offset); + w[61] = amd_bytealign_S (w[11], w[12], offset); + w[60] = amd_bytealign_S (w[10], w[11], offset); + w[59] = amd_bytealign_S (w[ 9], w[10], offset); + w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); + 
w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[49] = amd_bytealign_S ( 0, w[ 0], offset); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = amd_bytealign_S (w[12], w[13], offset); + w[62] = amd_bytealign_S (w[11], w[12], offset); + w[61] = amd_bytealign_S (w[10], w[11], offset); + w[60] = amd_bytealign_S (w[ 9], w[10], offset); + w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[50] = amd_bytealign_S ( 0, w[ 0], offset); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 
0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = amd_bytealign_S (w[11], w[12], offset); + w[62] = amd_bytealign_S (w[10], w[11], offset); + w[61] = amd_bytealign_S (w[ 9], w[10], offset); + w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[59] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[51] = amd_bytealign_S ( 0, w[ 0], offset); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = amd_bytealign_S (w[10], w[11], offset); + w[62] = amd_bytealign_S (w[ 9], w[10], offset); + w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[58] = amd_bytealign_S (w[ 5], w[ 
6], offset); + w[57] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[52] = amd_bytealign_S ( 0, w[ 0], offset); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = amd_bytealign_S (w[ 9], w[10], offset); + w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[54] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[53] = amd_bytealign_S ( 0, w[ 0], offset); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; 
+ w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = amd_bytealign_S (w[ 8], w[ 9], offset); + w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[54] = amd_bytealign_S ( 0, w[ 0], offset); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); + w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[55] = amd_bytealign_S ( 0, w[ 0], offset); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + 
w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); + w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[57] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[56] = amd_bytealign_S ( 0, w[ 0], offset); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); + w[62] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[60] = 
amd_bytealign_S (w[ 2], w[ 3], offset); + w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[57] = amd_bytealign_S ( 0, w[ 0], offset); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); + w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[60] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[58] = amd_bytealign_S ( 0, w[ 0], offset); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 
0; + + break; + + case 59: + w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); + w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[60] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[59] = amd_bytealign_S ( 0, w[ 0], offset); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = amd_bytealign_S (w[ 2], w[ 3], offset); + w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[60] = amd_bytealign_S ( 0, w[ 0], offset); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 
0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); + w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[61] = amd_bytealign_S ( 0, w[ 0], offset); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); + w[62] = amd_bytealign_S ( 0, w[ 0], offset); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = amd_bytealign_S ( 0, w[ 0], 
offset); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif + + #ifdef IS_NV + const int offset_minus_4 = 4 - (offset % 4); + + const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; + + switch (offset / 4) + { + case 0: + w[63] = __byte_perm_S (w[63], w[62], selector); + w[62] = __byte_perm_S (w[62], w[61], selector); + w[61] = __byte_perm_S (w[61], w[60], selector); + w[60] = __byte_perm_S (w[60], w[59], selector); + w[59] = __byte_perm_S (w[59], w[58], selector); + w[58] = __byte_perm_S (w[58], w[57], selector); + w[57] = __byte_perm_S (w[57], w[56], selector); + w[56] = __byte_perm_S (w[56], w[55], selector); + w[55] = __byte_perm_S (w[55], w[54], selector); + w[54] = __byte_perm_S (w[54], w[53], selector); + w[53] = __byte_perm_S (w[53], w[52], selector); + w[52] = __byte_perm_S (w[52], w[51], selector); + w[51] = __byte_perm_S (w[51], w[50], selector); + w[50] = __byte_perm_S (w[50], w[49], selector); + w[49] = __byte_perm_S (w[49], w[48], selector); + w[48] = __byte_perm_S (w[48], w[47], selector); + w[47] = __byte_perm_S (w[47], w[46], selector); + w[46] = __byte_perm_S (w[46], w[45], selector); + w[45] = __byte_perm_S (w[45], w[44], selector); + w[44] = __byte_perm_S 
(w[44], w[43], selector); + w[43] = __byte_perm_S (w[43], w[42], selector); + w[42] = __byte_perm_S (w[42], w[41], selector); + w[41] = __byte_perm_S (w[41], w[40], selector); + w[40] = __byte_perm_S (w[40], w[39], selector); + w[39] = __byte_perm_S (w[39], w[38], selector); + w[38] = __byte_perm_S (w[38], w[37], selector); + w[37] = __byte_perm_S (w[37], w[36], selector); + w[36] = __byte_perm_S (w[36], w[35], selector); + w[35] = __byte_perm_S (w[35], w[34], selector); + w[34] = __byte_perm_S (w[34], w[33], selector); + w[33] = __byte_perm_S (w[33], w[32], selector); + w[32] = __byte_perm_S (w[32], w[31], selector); + w[31] = __byte_perm_S (w[31], w[30], selector); + w[30] = __byte_perm_S (w[30], w[29], selector); + w[29] = __byte_perm_S (w[29], w[28], selector); + w[28] = __byte_perm_S (w[28], w[27], selector); + w[27] = __byte_perm_S (w[27], w[26], selector); + w[26] = __byte_perm_S (w[26], w[25], selector); + w[25] = __byte_perm_S (w[25], w[24], selector); + w[24] = __byte_perm_S (w[24], w[23], selector); + w[23] = __byte_perm_S (w[23], w[22], selector); + w[22] = __byte_perm_S (w[22], w[21], selector); + w[21] = __byte_perm_S (w[21], w[20], selector); + w[20] = __byte_perm_S (w[20], w[19], selector); + w[19] = __byte_perm_S (w[19], w[18], selector); + w[18] = __byte_perm_S (w[18], w[17], selector); + w[17] = __byte_perm_S (w[17], w[16], selector); + w[16] = __byte_perm_S (w[16], w[15], selector); + w[15] = __byte_perm_S (w[15], w[14], selector); + w[14] = __byte_perm_S (w[14], w[13], selector); + w[13] = __byte_perm_S (w[13], w[12], selector); + w[12] = __byte_perm_S (w[12], w[11], selector); + w[11] = __byte_perm_S (w[11], w[10], selector); + w[10] = __byte_perm_S (w[10], w[ 9], selector); + w[ 9] = __byte_perm_S (w[ 9], w[ 8], selector); + w[ 8] = __byte_perm_S (w[ 8], w[ 7], selector); + w[ 7] = __byte_perm_S (w[ 7], w[ 6], selector); + w[ 6] = __byte_perm_S (w[ 6], w[ 5], selector); + w[ 5] = __byte_perm_S (w[ 5], w[ 4], selector); + w[ 4] = __byte_perm_S 
(w[ 4], w[ 3], selector); + w[ 3] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 2] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 1] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 0] = __byte_perm_S (w[ 0], 0, selector); + + break; + + case 1: + w[63] = __byte_perm_S (w[62], w[61], selector); + w[62] = __byte_perm_S (w[61], w[60], selector); + w[61] = __byte_perm_S (w[60], w[59], selector); + w[60] = __byte_perm_S (w[59], w[58], selector); + w[59] = __byte_perm_S (w[58], w[57], selector); + w[58] = __byte_perm_S (w[57], w[56], selector); + w[57] = __byte_perm_S (w[56], w[55], selector); + w[56] = __byte_perm_S (w[55], w[54], selector); + w[55] = __byte_perm_S (w[54], w[53], selector); + w[54] = __byte_perm_S (w[53], w[52], selector); + w[53] = __byte_perm_S (w[52], w[51], selector); + w[52] = __byte_perm_S (w[51], w[50], selector); + w[51] = __byte_perm_S (w[50], w[49], selector); + w[50] = __byte_perm_S (w[49], w[48], selector); + w[49] = __byte_perm_S (w[48], w[47], selector); + w[48] = __byte_perm_S (w[47], w[46], selector); + w[47] = __byte_perm_S (w[46], w[45], selector); + w[46] = __byte_perm_S (w[45], w[44], selector); + w[45] = __byte_perm_S (w[44], w[43], selector); + w[44] = __byte_perm_S (w[43], w[42], selector); + w[43] = __byte_perm_S (w[42], w[41], selector); + w[42] = __byte_perm_S (w[41], w[40], selector); + w[41] = __byte_perm_S (w[40], w[39], selector); + w[40] = __byte_perm_S (w[39], w[38], selector); + w[39] = __byte_perm_S (w[38], w[37], selector); + w[38] = __byte_perm_S (w[37], w[36], selector); + w[37] = __byte_perm_S (w[36], w[35], selector); + w[36] = __byte_perm_S (w[35], w[34], selector); + w[35] = __byte_perm_S (w[34], w[33], selector); + w[34] = __byte_perm_S (w[33], w[32], selector); + w[33] = __byte_perm_S (w[32], w[31], selector); + w[32] = __byte_perm_S (w[31], w[30], selector); + w[31] = __byte_perm_S (w[30], w[29], selector); + w[30] = __byte_perm_S (w[29], w[28], selector); + w[29] = __byte_perm_S (w[28], w[27], selector); + 
w[28] = __byte_perm_S (w[27], w[26], selector); + w[27] = __byte_perm_S (w[26], w[25], selector); + w[26] = __byte_perm_S (w[25], w[24], selector); + w[25] = __byte_perm_S (w[24], w[23], selector); + w[24] = __byte_perm_S (w[23], w[22], selector); + w[23] = __byte_perm_S (w[22], w[21], selector); + w[22] = __byte_perm_S (w[21], w[20], selector); + w[21] = __byte_perm_S (w[20], w[19], selector); + w[20] = __byte_perm_S (w[19], w[18], selector); + w[19] = __byte_perm_S (w[18], w[17], selector); + w[18] = __byte_perm_S (w[17], w[16], selector); + w[17] = __byte_perm_S (w[16], w[15], selector); + w[16] = __byte_perm_S (w[15], w[14], selector); + w[15] = __byte_perm_S (w[14], w[13], selector); + w[14] = __byte_perm_S (w[13], w[12], selector); + w[13] = __byte_perm_S (w[12], w[11], selector); + w[12] = __byte_perm_S (w[11], w[10], selector); + w[11] = __byte_perm_S (w[10], w[ 9], selector); + w[10] = __byte_perm_S (w[ 9], w[ 8], selector); + w[ 9] = __byte_perm_S (w[ 8], w[ 7], selector); + w[ 8] = __byte_perm_S (w[ 7], w[ 6], selector); + w[ 7] = __byte_perm_S (w[ 6], w[ 5], selector); + w[ 6] = __byte_perm_S (w[ 5], w[ 4], selector); + w[ 5] = __byte_perm_S (w[ 4], w[ 3], selector); + w[ 4] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 3] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 2] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 1] = __byte_perm_S (w[ 0], 0, selector); + w[ 0] = 0; + + break; + + case 2: + w[63] = __byte_perm_S (w[61], w[60], selector); + w[62] = __byte_perm_S (w[60], w[59], selector); + w[61] = __byte_perm_S (w[59], w[58], selector); + w[60] = __byte_perm_S (w[58], w[57], selector); + w[59] = __byte_perm_S (w[57], w[56], selector); + w[58] = __byte_perm_S (w[56], w[55], selector); + w[57] = __byte_perm_S (w[55], w[54], selector); + w[56] = __byte_perm_S (w[54], w[53], selector); + w[55] = __byte_perm_S (w[53], w[52], selector); + w[54] = __byte_perm_S (w[52], w[51], selector); + w[53] = __byte_perm_S (w[51], w[50], selector); + w[52] = 
__byte_perm_S (w[50], w[49], selector); + w[51] = __byte_perm_S (w[49], w[48], selector); + w[50] = __byte_perm_S (w[48], w[47], selector); + w[49] = __byte_perm_S (w[47], w[46], selector); + w[48] = __byte_perm_S (w[46], w[45], selector); + w[47] = __byte_perm_S (w[45], w[44], selector); + w[46] = __byte_perm_S (w[44], w[43], selector); + w[45] = __byte_perm_S (w[43], w[42], selector); + w[44] = __byte_perm_S (w[42], w[41], selector); + w[43] = __byte_perm_S (w[41], w[40], selector); + w[42] = __byte_perm_S (w[40], w[39], selector); + w[41] = __byte_perm_S (w[39], w[38], selector); + w[40] = __byte_perm_S (w[38], w[37], selector); + w[39] = __byte_perm_S (w[37], w[36], selector); + w[38] = __byte_perm_S (w[36], w[35], selector); + w[37] = __byte_perm_S (w[35], w[34], selector); + w[36] = __byte_perm_S (w[34], w[33], selector); + w[35] = __byte_perm_S (w[33], w[32], selector); + w[34] = __byte_perm_S (w[32], w[31], selector); + w[33] = __byte_perm_S (w[31], w[30], selector); + w[32] = __byte_perm_S (w[30], w[29], selector); + w[31] = __byte_perm_S (w[29], w[28], selector); + w[30] = __byte_perm_S (w[28], w[27], selector); + w[29] = __byte_perm_S (w[27], w[26], selector); + w[28] = __byte_perm_S (w[26], w[25], selector); + w[27] = __byte_perm_S (w[25], w[24], selector); + w[26] = __byte_perm_S (w[24], w[23], selector); + w[25] = __byte_perm_S (w[23], w[22], selector); + w[24] = __byte_perm_S (w[22], w[21], selector); + w[23] = __byte_perm_S (w[21], w[20], selector); + w[22] = __byte_perm_S (w[20], w[19], selector); + w[21] = __byte_perm_S (w[19], w[18], selector); + w[20] = __byte_perm_S (w[18], w[17], selector); + w[19] = __byte_perm_S (w[17], w[16], selector); + w[18] = __byte_perm_S (w[16], w[15], selector); + w[17] = __byte_perm_S (w[15], w[14], selector); + w[16] = __byte_perm_S (w[14], w[13], selector); + w[15] = __byte_perm_S (w[13], w[12], selector); + w[14] = __byte_perm_S (w[12], w[11], selector); + w[13] = __byte_perm_S (w[11], w[10], selector); + w[12] = 
__byte_perm_S (w[10], w[ 9], selector); + w[11] = __byte_perm_S (w[ 9], w[ 8], selector); + w[10] = __byte_perm_S (w[ 8], w[ 7], selector); + w[ 9] = __byte_perm_S (w[ 7], w[ 6], selector); + w[ 8] = __byte_perm_S (w[ 6], w[ 5], selector); + w[ 7] = __byte_perm_S (w[ 5], w[ 4], selector); + w[ 6] = __byte_perm_S (w[ 4], w[ 3], selector); + w[ 5] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 4] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 3] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 2] = __byte_perm_S (w[ 0], 0, selector); + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 3: + w[63] = __byte_perm_S (w[60], w[59], selector); + w[62] = __byte_perm_S (w[59], w[58], selector); + w[61] = __byte_perm_S (w[58], w[57], selector); + w[60] = __byte_perm_S (w[57], w[56], selector); + w[59] = __byte_perm_S (w[56], w[55], selector); + w[58] = __byte_perm_S (w[55], w[54], selector); + w[57] = __byte_perm_S (w[54], w[53], selector); + w[56] = __byte_perm_S (w[53], w[52], selector); + w[55] = __byte_perm_S (w[52], w[51], selector); + w[54] = __byte_perm_S (w[51], w[50], selector); + w[53] = __byte_perm_S (w[50], w[49], selector); + w[52] = __byte_perm_S (w[49], w[48], selector); + w[51] = __byte_perm_S (w[48], w[47], selector); + w[50] = __byte_perm_S (w[47], w[46], selector); + w[49] = __byte_perm_S (w[46], w[45], selector); + w[48] = __byte_perm_S (w[45], w[44], selector); + w[47] = __byte_perm_S (w[44], w[43], selector); + w[46] = __byte_perm_S (w[43], w[42], selector); + w[45] = __byte_perm_S (w[42], w[41], selector); + w[44] = __byte_perm_S (w[41], w[40], selector); + w[43] = __byte_perm_S (w[40], w[39], selector); + w[42] = __byte_perm_S (w[39], w[38], selector); + w[41] = __byte_perm_S (w[38], w[37], selector); + w[40] = __byte_perm_S (w[37], w[36], selector); + w[39] = __byte_perm_S (w[36], w[35], selector); + w[38] = __byte_perm_S (w[35], w[34], selector); + w[37] = __byte_perm_S (w[34], w[33], selector); + w[36] = __byte_perm_S (w[33], w[32], selector); + w[35] = 
__byte_perm_S (w[32], w[31], selector); + w[34] = __byte_perm_S (w[31], w[30], selector); + w[33] = __byte_perm_S (w[30], w[29], selector); + w[32] = __byte_perm_S (w[29], w[28], selector); + w[31] = __byte_perm_S (w[28], w[27], selector); + w[30] = __byte_perm_S (w[27], w[26], selector); + w[29] = __byte_perm_S (w[26], w[25], selector); + w[28] = __byte_perm_S (w[25], w[24], selector); + w[27] = __byte_perm_S (w[24], w[23], selector); + w[26] = __byte_perm_S (w[23], w[22], selector); + w[25] = __byte_perm_S (w[22], w[21], selector); + w[24] = __byte_perm_S (w[21], w[20], selector); + w[23] = __byte_perm_S (w[20], w[19], selector); + w[22] = __byte_perm_S (w[19], w[18], selector); + w[21] = __byte_perm_S (w[18], w[17], selector); + w[20] = __byte_perm_S (w[17], w[16], selector); + w[19] = __byte_perm_S (w[16], w[15], selector); + w[18] = __byte_perm_S (w[15], w[14], selector); + w[17] = __byte_perm_S (w[14], w[13], selector); + w[16] = __byte_perm_S (w[13], w[12], selector); + w[15] = __byte_perm_S (w[12], w[11], selector); + w[14] = __byte_perm_S (w[11], w[10], selector); + w[13] = __byte_perm_S (w[10], w[ 9], selector); + w[12] = __byte_perm_S (w[ 9], w[ 8], selector); + w[11] = __byte_perm_S (w[ 8], w[ 7], selector); + w[10] = __byte_perm_S (w[ 7], w[ 6], selector); + w[ 9] = __byte_perm_S (w[ 6], w[ 5], selector); + w[ 8] = __byte_perm_S (w[ 5], w[ 4], selector); + w[ 7] = __byte_perm_S (w[ 4], w[ 3], selector); + w[ 6] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 5] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 4] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 3] = __byte_perm_S (w[ 0], 0, selector); + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 4: + w[63] = __byte_perm_S (w[59], w[58], selector); + w[62] = __byte_perm_S (w[58], w[57], selector); + w[61] = __byte_perm_S (w[57], w[56], selector); + w[60] = __byte_perm_S (w[56], w[55], selector); + w[59] = __byte_perm_S (w[55], w[54], selector); + w[58] = __byte_perm_S (w[54], w[53], selector); + 
w[57] = __byte_perm_S (w[53], w[52], selector); + w[56] = __byte_perm_S (w[52], w[51], selector); + w[55] = __byte_perm_S (w[51], w[50], selector); + w[54] = __byte_perm_S (w[50], w[49], selector); + w[53] = __byte_perm_S (w[49], w[48], selector); + w[52] = __byte_perm_S (w[48], w[47], selector); + w[51] = __byte_perm_S (w[47], w[46], selector); + w[50] = __byte_perm_S (w[46], w[45], selector); + w[49] = __byte_perm_S (w[45], w[44], selector); + w[48] = __byte_perm_S (w[44], w[43], selector); + w[47] = __byte_perm_S (w[43], w[42], selector); + w[46] = __byte_perm_S (w[42], w[41], selector); + w[45] = __byte_perm_S (w[41], w[40], selector); + w[44] = __byte_perm_S (w[40], w[39], selector); + w[43] = __byte_perm_S (w[39], w[38], selector); + w[42] = __byte_perm_S (w[38], w[37], selector); + w[41] = __byte_perm_S (w[37], w[36], selector); + w[40] = __byte_perm_S (w[36], w[35], selector); + w[39] = __byte_perm_S (w[35], w[34], selector); + w[38] = __byte_perm_S (w[34], w[33], selector); + w[37] = __byte_perm_S (w[33], w[32], selector); + w[36] = __byte_perm_S (w[32], w[31], selector); + w[35] = __byte_perm_S (w[31], w[30], selector); + w[34] = __byte_perm_S (w[30], w[29], selector); + w[33] = __byte_perm_S (w[29], w[28], selector); + w[32] = __byte_perm_S (w[28], w[27], selector); + w[31] = __byte_perm_S (w[27], w[26], selector); + w[30] = __byte_perm_S (w[26], w[25], selector); + w[29] = __byte_perm_S (w[25], w[24], selector); + w[28] = __byte_perm_S (w[24], w[23], selector); + w[27] = __byte_perm_S (w[23], w[22], selector); + w[26] = __byte_perm_S (w[22], w[21], selector); + w[25] = __byte_perm_S (w[21], w[20], selector); + w[24] = __byte_perm_S (w[20], w[19], selector); + w[23] = __byte_perm_S (w[19], w[18], selector); + w[22] = __byte_perm_S (w[18], w[17], selector); + w[21] = __byte_perm_S (w[17], w[16], selector); + w[20] = __byte_perm_S (w[16], w[15], selector); + w[19] = __byte_perm_S (w[15], w[14], selector); + w[18] = __byte_perm_S (w[14], w[13], selector); + 
w[17] = __byte_perm_S (w[13], w[12], selector); + w[16] = __byte_perm_S (w[12], w[11], selector); + w[15] = __byte_perm_S (w[11], w[10], selector); + w[14] = __byte_perm_S (w[10], w[ 9], selector); + w[13] = __byte_perm_S (w[ 9], w[ 8], selector); + w[12] = __byte_perm_S (w[ 8], w[ 7], selector); + w[11] = __byte_perm_S (w[ 7], w[ 6], selector); + w[10] = __byte_perm_S (w[ 6], w[ 5], selector); + w[ 9] = __byte_perm_S (w[ 5], w[ 4], selector); + w[ 8] = __byte_perm_S (w[ 4], w[ 3], selector); + w[ 7] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 6] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 5] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 4] = __byte_perm_S (w[ 0], 0, selector); + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 5: + w[63] = __byte_perm_S (w[58], w[57], selector); + w[62] = __byte_perm_S (w[57], w[56], selector); + w[61] = __byte_perm_S (w[56], w[55], selector); + w[60] = __byte_perm_S (w[55], w[54], selector); + w[59] = __byte_perm_S (w[54], w[53], selector); + w[58] = __byte_perm_S (w[53], w[52], selector); + w[57] = __byte_perm_S (w[52], w[51], selector); + w[56] = __byte_perm_S (w[51], w[50], selector); + w[55] = __byte_perm_S (w[50], w[49], selector); + w[54] = __byte_perm_S (w[49], w[48], selector); + w[53] = __byte_perm_S (w[48], w[47], selector); + w[52] = __byte_perm_S (w[47], w[46], selector); + w[51] = __byte_perm_S (w[46], w[45], selector); + w[50] = __byte_perm_S (w[45], w[44], selector); + w[49] = __byte_perm_S (w[44], w[43], selector); + w[48] = __byte_perm_S (w[43], w[42], selector); + w[47] = __byte_perm_S (w[42], w[41], selector); + w[46] = __byte_perm_S (w[41], w[40], selector); + w[45] = __byte_perm_S (w[40], w[39], selector); + w[44] = __byte_perm_S (w[39], w[38], selector); + w[43] = __byte_perm_S (w[38], w[37], selector); + w[42] = __byte_perm_S (w[37], w[36], selector); + w[41] = __byte_perm_S (w[36], w[35], selector); + w[40] = __byte_perm_S (w[35], w[34], selector); + w[39] = __byte_perm_S 
(w[34], w[33], selector); + w[38] = __byte_perm_S (w[33], w[32], selector); + w[37] = __byte_perm_S (w[32], w[31], selector); + w[36] = __byte_perm_S (w[31], w[30], selector); + w[35] = __byte_perm_S (w[30], w[29], selector); + w[34] = __byte_perm_S (w[29], w[28], selector); + w[33] = __byte_perm_S (w[28], w[27], selector); + w[32] = __byte_perm_S (w[27], w[26], selector); + w[31] = __byte_perm_S (w[26], w[25], selector); + w[30] = __byte_perm_S (w[25], w[24], selector); + w[29] = __byte_perm_S (w[24], w[23], selector); + w[28] = __byte_perm_S (w[23], w[22], selector); + w[27] = __byte_perm_S (w[22], w[21], selector); + w[26] = __byte_perm_S (w[21], w[20], selector); + w[25] = __byte_perm_S (w[20], w[19], selector); + w[24] = __byte_perm_S (w[19], w[18], selector); + w[23] = __byte_perm_S (w[18], w[17], selector); + w[22] = __byte_perm_S (w[17], w[16], selector); + w[21] = __byte_perm_S (w[16], w[15], selector); + w[20] = __byte_perm_S (w[15], w[14], selector); + w[19] = __byte_perm_S (w[14], w[13], selector); + w[18] = __byte_perm_S (w[13], w[12], selector); + w[17] = __byte_perm_S (w[12], w[11], selector); + w[16] = __byte_perm_S (w[11], w[10], selector); + w[15] = __byte_perm_S (w[10], w[ 9], selector); + w[14] = __byte_perm_S (w[ 9], w[ 8], selector); + w[13] = __byte_perm_S (w[ 8], w[ 7], selector); + w[12] = __byte_perm_S (w[ 7], w[ 6], selector); + w[11] = __byte_perm_S (w[ 6], w[ 5], selector); + w[10] = __byte_perm_S (w[ 5], w[ 4], selector); + w[ 9] = __byte_perm_S (w[ 4], w[ 3], selector); + w[ 8] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 7] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 6] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 5] = __byte_perm_S (w[ 0], 0, selector); + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 6: + w[63] = __byte_perm_S (w[57], w[56], selector); + w[62] = __byte_perm_S (w[56], w[55], selector); + w[61] = __byte_perm_S (w[55], w[54], selector); + w[60] = __byte_perm_S (w[54], w[53], 
selector); + w[59] = __byte_perm_S (w[53], w[52], selector); + w[58] = __byte_perm_S (w[52], w[51], selector); + w[57] = __byte_perm_S (w[51], w[50], selector); + w[56] = __byte_perm_S (w[50], w[49], selector); + w[55] = __byte_perm_S (w[49], w[48], selector); + w[54] = __byte_perm_S (w[48], w[47], selector); + w[53] = __byte_perm_S (w[47], w[46], selector); + w[52] = __byte_perm_S (w[46], w[45], selector); + w[51] = __byte_perm_S (w[45], w[44], selector); + w[50] = __byte_perm_S (w[44], w[43], selector); + w[49] = __byte_perm_S (w[43], w[42], selector); + w[48] = __byte_perm_S (w[42], w[41], selector); + w[47] = __byte_perm_S (w[41], w[40], selector); + w[46] = __byte_perm_S (w[40], w[39], selector); + w[45] = __byte_perm_S (w[39], w[38], selector); + w[44] = __byte_perm_S (w[38], w[37], selector); + w[43] = __byte_perm_S (w[37], w[36], selector); + w[42] = __byte_perm_S (w[36], w[35], selector); + w[41] = __byte_perm_S (w[35], w[34], selector); + w[40] = __byte_perm_S (w[34], w[33], selector); + w[39] = __byte_perm_S (w[33], w[32], selector); + w[38] = __byte_perm_S (w[32], w[31], selector); + w[37] = __byte_perm_S (w[31], w[30], selector); + w[36] = __byte_perm_S (w[30], w[29], selector); + w[35] = __byte_perm_S (w[29], w[28], selector); + w[34] = __byte_perm_S (w[28], w[27], selector); + w[33] = __byte_perm_S (w[27], w[26], selector); + w[32] = __byte_perm_S (w[26], w[25], selector); + w[31] = __byte_perm_S (w[25], w[24], selector); + w[30] = __byte_perm_S (w[24], w[23], selector); + w[29] = __byte_perm_S (w[23], w[22], selector); + w[28] = __byte_perm_S (w[22], w[21], selector); + w[27] = __byte_perm_S (w[21], w[20], selector); + w[26] = __byte_perm_S (w[20], w[19], selector); + w[25] = __byte_perm_S (w[19], w[18], selector); + w[24] = __byte_perm_S (w[18], w[17], selector); + w[23] = __byte_perm_S (w[17], w[16], selector); + w[22] = __byte_perm_S (w[16], w[15], selector); + w[21] = __byte_perm_S (w[15], w[14], selector); + w[20] = __byte_perm_S (w[14], w[13], 
selector); + w[19] = __byte_perm_S (w[13], w[12], selector); + w[18] = __byte_perm_S (w[12], w[11], selector); + w[17] = __byte_perm_S (w[11], w[10], selector); + w[16] = __byte_perm_S (w[10], w[ 9], selector); + w[15] = __byte_perm_S (w[ 9], w[ 8], selector); + w[14] = __byte_perm_S (w[ 8], w[ 7], selector); + w[13] = __byte_perm_S (w[ 7], w[ 6], selector); + w[12] = __byte_perm_S (w[ 6], w[ 5], selector); + w[11] = __byte_perm_S (w[ 5], w[ 4], selector); + w[10] = __byte_perm_S (w[ 4], w[ 3], selector); + w[ 9] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 8] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 7] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 6] = __byte_perm_S (w[ 0], 0, selector); + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 7: + w[63] = __byte_perm_S (w[56], w[55], selector); + w[62] = __byte_perm_S (w[55], w[54], selector); + w[61] = __byte_perm_S (w[54], w[53], selector); + w[60] = __byte_perm_S (w[53], w[52], selector); + w[59] = __byte_perm_S (w[52], w[51], selector); + w[58] = __byte_perm_S (w[51], w[50], selector); + w[57] = __byte_perm_S (w[50], w[49], selector); + w[56] = __byte_perm_S (w[49], w[48], selector); + w[55] = __byte_perm_S (w[48], w[47], selector); + w[54] = __byte_perm_S (w[47], w[46], selector); + w[53] = __byte_perm_S (w[46], w[45], selector); + w[52] = __byte_perm_S (w[45], w[44], selector); + w[51] = __byte_perm_S (w[44], w[43], selector); + w[50] = __byte_perm_S (w[43], w[42], selector); + w[49] = __byte_perm_S (w[42], w[41], selector); + w[48] = __byte_perm_S (w[41], w[40], selector); + w[47] = __byte_perm_S (w[40], w[39], selector); + w[46] = __byte_perm_S (w[39], w[38], selector); + w[45] = __byte_perm_S (w[38], w[37], selector); + w[44] = __byte_perm_S (w[37], w[36], selector); + w[43] = __byte_perm_S (w[36], w[35], selector); + w[42] = __byte_perm_S (w[35], w[34], selector); + w[41] = __byte_perm_S (w[34], w[33], selector); + w[40] = __byte_perm_S (w[33], w[32], 
selector); + w[39] = __byte_perm_S (w[32], w[31], selector); + w[38] = __byte_perm_S (w[31], w[30], selector); + w[37] = __byte_perm_S (w[30], w[29], selector); + w[36] = __byte_perm_S (w[29], w[28], selector); + w[35] = __byte_perm_S (w[28], w[27], selector); + w[34] = __byte_perm_S (w[27], w[26], selector); + w[33] = __byte_perm_S (w[26], w[25], selector); + w[32] = __byte_perm_S (w[25], w[24], selector); + w[31] = __byte_perm_S (w[24], w[23], selector); + w[30] = __byte_perm_S (w[23], w[22], selector); + w[29] = __byte_perm_S (w[22], w[21], selector); + w[28] = __byte_perm_S (w[21], w[20], selector); + w[27] = __byte_perm_S (w[20], w[19], selector); + w[26] = __byte_perm_S (w[19], w[18], selector); + w[25] = __byte_perm_S (w[18], w[17], selector); + w[24] = __byte_perm_S (w[17], w[16], selector); + w[23] = __byte_perm_S (w[16], w[15], selector); + w[22] = __byte_perm_S (w[15], w[14], selector); + w[21] = __byte_perm_S (w[14], w[13], selector); + w[20] = __byte_perm_S (w[13], w[12], selector); + w[19] = __byte_perm_S (w[12], w[11], selector); + w[18] = __byte_perm_S (w[11], w[10], selector); + w[17] = __byte_perm_S (w[10], w[ 9], selector); + w[16] = __byte_perm_S (w[ 9], w[ 8], selector); + w[15] = __byte_perm_S (w[ 8], w[ 7], selector); + w[14] = __byte_perm_S (w[ 7], w[ 6], selector); + w[13] = __byte_perm_S (w[ 6], w[ 5], selector); + w[12] = __byte_perm_S (w[ 5], w[ 4], selector); + w[11] = __byte_perm_S (w[ 4], w[ 3], selector); + w[10] = __byte_perm_S (w[ 3], w[ 2], selector); + w[ 9] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 8] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 7] = __byte_perm_S (w[ 0], 0, selector); + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 8: + w[63] = __byte_perm_S (w[55], w[54], selector); + w[62] = __byte_perm_S (w[54], w[53], selector); + w[61] = __byte_perm_S (w[53], w[52], selector); + w[60] = __byte_perm_S (w[52], w[51], selector); + w[59] = __byte_perm_S 
(w[51], w[50], selector); + w[58] = __byte_perm_S (w[50], w[49], selector); + w[57] = __byte_perm_S (w[49], w[48], selector); + w[56] = __byte_perm_S (w[48], w[47], selector); + w[55] = __byte_perm_S (w[47], w[46], selector); + w[54] = __byte_perm_S (w[46], w[45], selector); + w[53] = __byte_perm_S (w[45], w[44], selector); + w[52] = __byte_perm_S (w[44], w[43], selector); + w[51] = __byte_perm_S (w[43], w[42], selector); + w[50] = __byte_perm_S (w[42], w[41], selector); + w[49] = __byte_perm_S (w[41], w[40], selector); + w[48] = __byte_perm_S (w[40], w[39], selector); + w[47] = __byte_perm_S (w[39], w[38], selector); + w[46] = __byte_perm_S (w[38], w[37], selector); + w[45] = __byte_perm_S (w[37], w[36], selector); + w[44] = __byte_perm_S (w[36], w[35], selector); + w[43] = __byte_perm_S (w[35], w[34], selector); + w[42] = __byte_perm_S (w[34], w[33], selector); + w[41] = __byte_perm_S (w[33], w[32], selector); + w[40] = __byte_perm_S (w[32], w[31], selector); + w[39] = __byte_perm_S (w[31], w[30], selector); + w[38] = __byte_perm_S (w[30], w[29], selector); + w[37] = __byte_perm_S (w[29], w[28], selector); + w[36] = __byte_perm_S (w[28], w[27], selector); + w[35] = __byte_perm_S (w[27], w[26], selector); + w[34] = __byte_perm_S (w[26], w[25], selector); + w[33] = __byte_perm_S (w[25], w[24], selector); + w[32] = __byte_perm_S (w[24], w[23], selector); + w[31] = __byte_perm_S (w[23], w[22], selector); + w[30] = __byte_perm_S (w[22], w[21], selector); + w[29] = __byte_perm_S (w[21], w[20], selector); + w[28] = __byte_perm_S (w[20], w[19], selector); + w[27] = __byte_perm_S (w[19], w[18], selector); + w[26] = __byte_perm_S (w[18], w[17], selector); + w[25] = __byte_perm_S (w[17], w[16], selector); + w[24] = __byte_perm_S (w[16], w[15], selector); + w[23] = __byte_perm_S (w[15], w[14], selector); + w[22] = __byte_perm_S (w[14], w[13], selector); + w[21] = __byte_perm_S (w[13], w[12], selector); + w[20] = __byte_perm_S (w[12], w[11], selector); + w[19] = __byte_perm_S 
(w[11], w[10], selector); + w[18] = __byte_perm_S (w[10], w[ 9], selector); + w[17] = __byte_perm_S (w[ 9], w[ 8], selector); + w[16] = __byte_perm_S (w[ 8], w[ 7], selector); + w[15] = __byte_perm_S (w[ 7], w[ 6], selector); + w[14] = __byte_perm_S (w[ 6], w[ 5], selector); + w[13] = __byte_perm_S (w[ 5], w[ 4], selector); + w[12] = __byte_perm_S (w[ 4], w[ 3], selector); + w[11] = __byte_perm_S (w[ 3], w[ 2], selector); + w[10] = __byte_perm_S (w[ 2], w[ 1], selector); + w[ 9] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 8] = __byte_perm_S (w[ 0], 0, selector); + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 9: + w[63] = __byte_perm_S (w[54], w[53], selector); + w[62] = __byte_perm_S (w[53], w[52], selector); + w[61] = __byte_perm_S (w[52], w[51], selector); + w[60] = __byte_perm_S (w[51], w[50], selector); + w[59] = __byte_perm_S (w[50], w[49], selector); + w[58] = __byte_perm_S (w[49], w[48], selector); + w[57] = __byte_perm_S (w[48], w[47], selector); + w[56] = __byte_perm_S (w[47], w[46], selector); + w[55] = __byte_perm_S (w[46], w[45], selector); + w[54] = __byte_perm_S (w[45], w[44], selector); + w[53] = __byte_perm_S (w[44], w[43], selector); + w[52] = __byte_perm_S (w[43], w[42], selector); + w[51] = __byte_perm_S (w[42], w[41], selector); + w[50] = __byte_perm_S (w[41], w[40], selector); + w[49] = __byte_perm_S (w[40], w[39], selector); + w[48] = __byte_perm_S (w[39], w[38], selector); + w[47] = __byte_perm_S (w[38], w[37], selector); + w[46] = __byte_perm_S (w[37], w[36], selector); + w[45] = __byte_perm_S (w[36], w[35], selector); + w[44] = __byte_perm_S (w[35], w[34], selector); + w[43] = __byte_perm_S (w[34], w[33], selector); + w[42] = __byte_perm_S (w[33], w[32], selector); + w[41] = __byte_perm_S (w[32], w[31], selector); + w[40] = __byte_perm_S (w[31], w[30], selector); + w[39] = __byte_perm_S (w[30], w[29], selector); + w[38] = __byte_perm_S (w[29], w[28], selector); 
+ w[37] = __byte_perm_S (w[28], w[27], selector); + w[36] = __byte_perm_S (w[27], w[26], selector); + w[35] = __byte_perm_S (w[26], w[25], selector); + w[34] = __byte_perm_S (w[25], w[24], selector); + w[33] = __byte_perm_S (w[24], w[23], selector); + w[32] = __byte_perm_S (w[23], w[22], selector); + w[31] = __byte_perm_S (w[22], w[21], selector); + w[30] = __byte_perm_S (w[21], w[20], selector); + w[29] = __byte_perm_S (w[20], w[19], selector); + w[28] = __byte_perm_S (w[19], w[18], selector); + w[27] = __byte_perm_S (w[18], w[17], selector); + w[26] = __byte_perm_S (w[17], w[16], selector); + w[25] = __byte_perm_S (w[16], w[15], selector); + w[24] = __byte_perm_S (w[15], w[14], selector); + w[23] = __byte_perm_S (w[14], w[13], selector); + w[22] = __byte_perm_S (w[13], w[12], selector); + w[21] = __byte_perm_S (w[12], w[11], selector); + w[20] = __byte_perm_S (w[11], w[10], selector); + w[19] = __byte_perm_S (w[10], w[ 9], selector); + w[18] = __byte_perm_S (w[ 9], w[ 8], selector); + w[17] = __byte_perm_S (w[ 8], w[ 7], selector); + w[16] = __byte_perm_S (w[ 7], w[ 6], selector); + w[15] = __byte_perm_S (w[ 6], w[ 5], selector); + w[14] = __byte_perm_S (w[ 5], w[ 4], selector); + w[13] = __byte_perm_S (w[ 4], w[ 3], selector); + w[12] = __byte_perm_S (w[ 3], w[ 2], selector); + w[11] = __byte_perm_S (w[ 2], w[ 1], selector); + w[10] = __byte_perm_S (w[ 1], w[ 0], selector); + w[ 9] = __byte_perm_S (w[ 0], 0, selector); + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 10: + w[63] = __byte_perm_S (w[53], w[52], selector); + w[62] = __byte_perm_S (w[52], w[51], selector); + w[61] = __byte_perm_S (w[51], w[50], selector); + w[60] = __byte_perm_S (w[50], w[49], selector); + w[59] = __byte_perm_S (w[49], w[48], selector); + w[58] = __byte_perm_S (w[48], w[47], selector); + w[57] = __byte_perm_S (w[47], w[46], selector); + w[56] = __byte_perm_S (w[46], w[45], selector); + w[55] = 
__byte_perm_S (w[45], w[44], selector); + w[54] = __byte_perm_S (w[44], w[43], selector); + w[53] = __byte_perm_S (w[43], w[42], selector); + w[52] = __byte_perm_S (w[42], w[41], selector); + w[51] = __byte_perm_S (w[41], w[40], selector); + w[50] = __byte_perm_S (w[40], w[39], selector); + w[49] = __byte_perm_S (w[39], w[38], selector); + w[48] = __byte_perm_S (w[38], w[37], selector); + w[47] = __byte_perm_S (w[37], w[36], selector); + w[46] = __byte_perm_S (w[36], w[35], selector); + w[45] = __byte_perm_S (w[35], w[34], selector); + w[44] = __byte_perm_S (w[34], w[33], selector); + w[43] = __byte_perm_S (w[33], w[32], selector); + w[42] = __byte_perm_S (w[32], w[31], selector); + w[41] = __byte_perm_S (w[31], w[30], selector); + w[40] = __byte_perm_S (w[30], w[29], selector); + w[39] = __byte_perm_S (w[29], w[28], selector); + w[38] = __byte_perm_S (w[28], w[27], selector); + w[37] = __byte_perm_S (w[27], w[26], selector); + w[36] = __byte_perm_S (w[26], w[25], selector); + w[35] = __byte_perm_S (w[25], w[24], selector); + w[34] = __byte_perm_S (w[24], w[23], selector); + w[33] = __byte_perm_S (w[23], w[22], selector); + w[32] = __byte_perm_S (w[22], w[21], selector); + w[31] = __byte_perm_S (w[21], w[20], selector); + w[30] = __byte_perm_S (w[20], w[19], selector); + w[29] = __byte_perm_S (w[19], w[18], selector); + w[28] = __byte_perm_S (w[18], w[17], selector); + w[27] = __byte_perm_S (w[17], w[16], selector); + w[26] = __byte_perm_S (w[16], w[15], selector); + w[25] = __byte_perm_S (w[15], w[14], selector); + w[24] = __byte_perm_S (w[14], w[13], selector); + w[23] = __byte_perm_S (w[13], w[12], selector); + w[22] = __byte_perm_S (w[12], w[11], selector); + w[21] = __byte_perm_S (w[11], w[10], selector); + w[20] = __byte_perm_S (w[10], w[ 9], selector); + w[19] = __byte_perm_S (w[ 9], w[ 8], selector); + w[18] = __byte_perm_S (w[ 8], w[ 7], selector); + w[17] = __byte_perm_S (w[ 7], w[ 6], selector); + w[16] = __byte_perm_S (w[ 6], w[ 5], selector); + w[15] = 
__byte_perm_S (w[ 5], w[ 4], selector); + w[14] = __byte_perm_S (w[ 4], w[ 3], selector); + w[13] = __byte_perm_S (w[ 3], w[ 2], selector); + w[12] = __byte_perm_S (w[ 2], w[ 1], selector); + w[11] = __byte_perm_S (w[ 1], w[ 0], selector); + w[10] = __byte_perm_S (w[ 0], 0, selector); + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 11: + w[63] = __byte_perm_S (w[52], w[51], selector); + w[62] = __byte_perm_S (w[51], w[50], selector); + w[61] = __byte_perm_S (w[50], w[49], selector); + w[60] = __byte_perm_S (w[49], w[48], selector); + w[59] = __byte_perm_S (w[48], w[47], selector); + w[58] = __byte_perm_S (w[47], w[46], selector); + w[57] = __byte_perm_S (w[46], w[45], selector); + w[56] = __byte_perm_S (w[45], w[44], selector); + w[55] = __byte_perm_S (w[44], w[43], selector); + w[54] = __byte_perm_S (w[43], w[42], selector); + w[53] = __byte_perm_S (w[42], w[41], selector); + w[52] = __byte_perm_S (w[41], w[40], selector); + w[51] = __byte_perm_S (w[40], w[39], selector); + w[50] = __byte_perm_S (w[39], w[38], selector); + w[49] = __byte_perm_S (w[38], w[37], selector); + w[48] = __byte_perm_S (w[37], w[36], selector); + w[47] = __byte_perm_S (w[36], w[35], selector); + w[46] = __byte_perm_S (w[35], w[34], selector); + w[45] = __byte_perm_S (w[34], w[33], selector); + w[44] = __byte_perm_S (w[33], w[32], selector); + w[43] = __byte_perm_S (w[32], w[31], selector); + w[42] = __byte_perm_S (w[31], w[30], selector); + w[41] = __byte_perm_S (w[30], w[29], selector); + w[40] = __byte_perm_S (w[29], w[28], selector); + w[39] = __byte_perm_S (w[28], w[27], selector); + w[38] = __byte_perm_S (w[27], w[26], selector); + w[37] = __byte_perm_S (w[26], w[25], selector); + w[36] = __byte_perm_S (w[25], w[24], selector); + w[35] = __byte_perm_S (w[24], w[23], selector); + w[34] = __byte_perm_S (w[23], w[22], selector); + w[33] = __byte_perm_S (w[22], w[21], selector); + w[32] = 
__byte_perm_S (w[21], w[20], selector); + w[31] = __byte_perm_S (w[20], w[19], selector); + w[30] = __byte_perm_S (w[19], w[18], selector); + w[29] = __byte_perm_S (w[18], w[17], selector); + w[28] = __byte_perm_S (w[17], w[16], selector); + w[27] = __byte_perm_S (w[16], w[15], selector); + w[26] = __byte_perm_S (w[15], w[14], selector); + w[25] = __byte_perm_S (w[14], w[13], selector); + w[24] = __byte_perm_S (w[13], w[12], selector); + w[23] = __byte_perm_S (w[12], w[11], selector); + w[22] = __byte_perm_S (w[11], w[10], selector); + w[21] = __byte_perm_S (w[10], w[ 9], selector); + w[20] = __byte_perm_S (w[ 9], w[ 8], selector); + w[19] = __byte_perm_S (w[ 8], w[ 7], selector); + w[18] = __byte_perm_S (w[ 7], w[ 6], selector); + w[17] = __byte_perm_S (w[ 6], w[ 5], selector); + w[16] = __byte_perm_S (w[ 5], w[ 4], selector); + w[15] = __byte_perm_S (w[ 4], w[ 3], selector); + w[14] = __byte_perm_S (w[ 3], w[ 2], selector); + w[13] = __byte_perm_S (w[ 2], w[ 1], selector); + w[12] = __byte_perm_S (w[ 1], w[ 0], selector); + w[11] = __byte_perm_S (w[ 0], 0, selector); + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 12: + w[63] = __byte_perm_S (w[51], w[50], selector); + w[62] = __byte_perm_S (w[50], w[49], selector); + w[61] = __byte_perm_S (w[49], w[48], selector); + w[60] = __byte_perm_S (w[48], w[47], selector); + w[59] = __byte_perm_S (w[47], w[46], selector); + w[58] = __byte_perm_S (w[46], w[45], selector); + w[57] = __byte_perm_S (w[45], w[44], selector); + w[56] = __byte_perm_S (w[44], w[43], selector); + w[55] = __byte_perm_S (w[43], w[42], selector); + w[54] = __byte_perm_S (w[42], w[41], selector); + w[53] = __byte_perm_S (w[41], w[40], selector); + w[52] = __byte_perm_S (w[40], w[39], selector); + w[51] = __byte_perm_S (w[39], w[38], selector); + w[50] = __byte_perm_S (w[38], w[37], selector); + w[49] = __byte_perm_S (w[37], w[36], 
selector); + w[48] = __byte_perm_S (w[36], w[35], selector); + w[47] = __byte_perm_S (w[35], w[34], selector); + w[46] = __byte_perm_S (w[34], w[33], selector); + w[45] = __byte_perm_S (w[33], w[32], selector); + w[44] = __byte_perm_S (w[32], w[31], selector); + w[43] = __byte_perm_S (w[31], w[30], selector); + w[42] = __byte_perm_S (w[30], w[29], selector); + w[41] = __byte_perm_S (w[29], w[28], selector); + w[40] = __byte_perm_S (w[28], w[27], selector); + w[39] = __byte_perm_S (w[27], w[26], selector); + w[38] = __byte_perm_S (w[26], w[25], selector); + w[37] = __byte_perm_S (w[25], w[24], selector); + w[36] = __byte_perm_S (w[24], w[23], selector); + w[35] = __byte_perm_S (w[23], w[22], selector); + w[34] = __byte_perm_S (w[22], w[21], selector); + w[33] = __byte_perm_S (w[21], w[20], selector); + w[32] = __byte_perm_S (w[20], w[19], selector); + w[31] = __byte_perm_S (w[19], w[18], selector); + w[30] = __byte_perm_S (w[18], w[17], selector); + w[29] = __byte_perm_S (w[17], w[16], selector); + w[28] = __byte_perm_S (w[16], w[15], selector); + w[27] = __byte_perm_S (w[15], w[14], selector); + w[26] = __byte_perm_S (w[14], w[13], selector); + w[25] = __byte_perm_S (w[13], w[12], selector); + w[24] = __byte_perm_S (w[12], w[11], selector); + w[23] = __byte_perm_S (w[11], w[10], selector); + w[22] = __byte_perm_S (w[10], w[ 9], selector); + w[21] = __byte_perm_S (w[ 9], w[ 8], selector); + w[20] = __byte_perm_S (w[ 8], w[ 7], selector); + w[19] = __byte_perm_S (w[ 7], w[ 6], selector); + w[18] = __byte_perm_S (w[ 6], w[ 5], selector); + w[17] = __byte_perm_S (w[ 5], w[ 4], selector); + w[16] = __byte_perm_S (w[ 4], w[ 3], selector); + w[15] = __byte_perm_S (w[ 3], w[ 2], selector); + w[14] = __byte_perm_S (w[ 2], w[ 1], selector); + w[13] = __byte_perm_S (w[ 1], w[ 0], selector); + w[12] = __byte_perm_S (w[ 0], 0, selector); + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; 
+ w[ 0] = 0; + + break; + + case 13: + w[63] = __byte_perm_S (w[50], w[49], selector); + w[62] = __byte_perm_S (w[49], w[48], selector); + w[61] = __byte_perm_S (w[48], w[47], selector); + w[60] = __byte_perm_S (w[47], w[46], selector); + w[59] = __byte_perm_S (w[46], w[45], selector); + w[58] = __byte_perm_S (w[45], w[44], selector); + w[57] = __byte_perm_S (w[44], w[43], selector); + w[56] = __byte_perm_S (w[43], w[42], selector); + w[55] = __byte_perm_S (w[42], w[41], selector); + w[54] = __byte_perm_S (w[41], w[40], selector); + w[53] = __byte_perm_S (w[40], w[39], selector); + w[52] = __byte_perm_S (w[39], w[38], selector); + w[51] = __byte_perm_S (w[38], w[37], selector); + w[50] = __byte_perm_S (w[37], w[36], selector); + w[49] = __byte_perm_S (w[36], w[35], selector); + w[48] = __byte_perm_S (w[35], w[34], selector); + w[47] = __byte_perm_S (w[34], w[33], selector); + w[46] = __byte_perm_S (w[33], w[32], selector); + w[45] = __byte_perm_S (w[32], w[31], selector); + w[44] = __byte_perm_S (w[31], w[30], selector); + w[43] = __byte_perm_S (w[30], w[29], selector); + w[42] = __byte_perm_S (w[29], w[28], selector); + w[41] = __byte_perm_S (w[28], w[27], selector); + w[40] = __byte_perm_S (w[27], w[26], selector); + w[39] = __byte_perm_S (w[26], w[25], selector); + w[38] = __byte_perm_S (w[25], w[24], selector); + w[37] = __byte_perm_S (w[24], w[23], selector); + w[36] = __byte_perm_S (w[23], w[22], selector); + w[35] = __byte_perm_S (w[22], w[21], selector); + w[34] = __byte_perm_S (w[21], w[20], selector); + w[33] = __byte_perm_S (w[20], w[19], selector); + w[32] = __byte_perm_S (w[19], w[18], selector); + w[31] = __byte_perm_S (w[18], w[17], selector); + w[30] = __byte_perm_S (w[17], w[16], selector); + w[29] = __byte_perm_S (w[16], w[15], selector); + w[28] = __byte_perm_S (w[15], w[14], selector); + w[27] = __byte_perm_S (w[14], w[13], selector); + w[26] = __byte_perm_S (w[13], w[12], selector); + w[25] = __byte_perm_S (w[12], w[11], selector); + w[24] = 
__byte_perm_S (w[11], w[10], selector); + w[23] = __byte_perm_S (w[10], w[ 9], selector); + w[22] = __byte_perm_S (w[ 9], w[ 8], selector); + w[21] = __byte_perm_S (w[ 8], w[ 7], selector); + w[20] = __byte_perm_S (w[ 7], w[ 6], selector); + w[19] = __byte_perm_S (w[ 6], w[ 5], selector); + w[18] = __byte_perm_S (w[ 5], w[ 4], selector); + w[17] = __byte_perm_S (w[ 4], w[ 3], selector); + w[16] = __byte_perm_S (w[ 3], w[ 2], selector); + w[15] = __byte_perm_S (w[ 2], w[ 1], selector); + w[14] = __byte_perm_S (w[ 1], w[ 0], selector); + w[13] = __byte_perm_S (w[ 0], 0, selector); + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 14: + w[63] = __byte_perm_S (w[49], w[48], selector); + w[62] = __byte_perm_S (w[48], w[47], selector); + w[61] = __byte_perm_S (w[47], w[46], selector); + w[60] = __byte_perm_S (w[46], w[45], selector); + w[59] = __byte_perm_S (w[45], w[44], selector); + w[58] = __byte_perm_S (w[44], w[43], selector); + w[57] = __byte_perm_S (w[43], w[42], selector); + w[56] = __byte_perm_S (w[42], w[41], selector); + w[55] = __byte_perm_S (w[41], w[40], selector); + w[54] = __byte_perm_S (w[40], w[39], selector); + w[53] = __byte_perm_S (w[39], w[38], selector); + w[52] = __byte_perm_S (w[38], w[37], selector); + w[51] = __byte_perm_S (w[37], w[36], selector); + w[50] = __byte_perm_S (w[36], w[35], selector); + w[49] = __byte_perm_S (w[35], w[34], selector); + w[48] = __byte_perm_S (w[34], w[33], selector); + w[47] = __byte_perm_S (w[33], w[32], selector); + w[46] = __byte_perm_S (w[32], w[31], selector); + w[45] = __byte_perm_S (w[31], w[30], selector); + w[44] = __byte_perm_S (w[30], w[29], selector); + w[43] = __byte_perm_S (w[29], w[28], selector); + w[42] = __byte_perm_S (w[28], w[27], selector); + w[41] = __byte_perm_S (w[27], w[26], selector); + w[40] = __byte_perm_S (w[26], w[25], selector); + w[39] = 
__byte_perm_S (w[25], w[24], selector); + w[38] = __byte_perm_S (w[24], w[23], selector); + w[37] = __byte_perm_S (w[23], w[22], selector); + w[36] = __byte_perm_S (w[22], w[21], selector); + w[35] = __byte_perm_S (w[21], w[20], selector); + w[34] = __byte_perm_S (w[20], w[19], selector); + w[33] = __byte_perm_S (w[19], w[18], selector); + w[32] = __byte_perm_S (w[18], w[17], selector); + w[31] = __byte_perm_S (w[17], w[16], selector); + w[30] = __byte_perm_S (w[16], w[15], selector); + w[29] = __byte_perm_S (w[15], w[14], selector); + w[28] = __byte_perm_S (w[14], w[13], selector); + w[27] = __byte_perm_S (w[13], w[12], selector); + w[26] = __byte_perm_S (w[12], w[11], selector); + w[25] = __byte_perm_S (w[11], w[10], selector); + w[24] = __byte_perm_S (w[10], w[ 9], selector); + w[23] = __byte_perm_S (w[ 9], w[ 8], selector); + w[22] = __byte_perm_S (w[ 8], w[ 7], selector); + w[21] = __byte_perm_S (w[ 7], w[ 6], selector); + w[20] = __byte_perm_S (w[ 6], w[ 5], selector); + w[19] = __byte_perm_S (w[ 5], w[ 4], selector); + w[18] = __byte_perm_S (w[ 4], w[ 3], selector); + w[17] = __byte_perm_S (w[ 3], w[ 2], selector); + w[16] = __byte_perm_S (w[ 2], w[ 1], selector); + w[15] = __byte_perm_S (w[ 1], w[ 0], selector); + w[14] = __byte_perm_S (w[ 0], 0, selector); + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 15: + w[63] = __byte_perm_S (w[48], w[47], selector); + w[62] = __byte_perm_S (w[47], w[46], selector); + w[61] = __byte_perm_S (w[46], w[45], selector); + w[60] = __byte_perm_S (w[45], w[44], selector); + w[59] = __byte_perm_S (w[44], w[43], selector); + w[58] = __byte_perm_S (w[43], w[42], selector); + w[57] = __byte_perm_S (w[42], w[41], selector); + w[56] = __byte_perm_S (w[41], w[40], selector); + w[55] = __byte_perm_S (w[40], w[39], selector); + w[54] = __byte_perm_S (w[39], w[38], selector); + w[53] 
= __byte_perm_S (w[38], w[37], selector); + w[52] = __byte_perm_S (w[37], w[36], selector); + w[51] = __byte_perm_S (w[36], w[35], selector); + w[50] = __byte_perm_S (w[35], w[34], selector); + w[49] = __byte_perm_S (w[34], w[33], selector); + w[48] = __byte_perm_S (w[33], w[32], selector); + w[47] = __byte_perm_S (w[32], w[31], selector); + w[46] = __byte_perm_S (w[31], w[30], selector); + w[45] = __byte_perm_S (w[30], w[29], selector); + w[44] = __byte_perm_S (w[29], w[28], selector); + w[43] = __byte_perm_S (w[28], w[27], selector); + w[42] = __byte_perm_S (w[27], w[26], selector); + w[41] = __byte_perm_S (w[26], w[25], selector); + w[40] = __byte_perm_S (w[25], w[24], selector); + w[39] = __byte_perm_S (w[24], w[23], selector); + w[38] = __byte_perm_S (w[23], w[22], selector); + w[37] = __byte_perm_S (w[22], w[21], selector); + w[36] = __byte_perm_S (w[21], w[20], selector); + w[35] = __byte_perm_S (w[20], w[19], selector); + w[34] = __byte_perm_S (w[19], w[18], selector); + w[33] = __byte_perm_S (w[18], w[17], selector); + w[32] = __byte_perm_S (w[17], w[16], selector); + w[31] = __byte_perm_S (w[16], w[15], selector); + w[30] = __byte_perm_S (w[15], w[14], selector); + w[29] = __byte_perm_S (w[14], w[13], selector); + w[28] = __byte_perm_S (w[13], w[12], selector); + w[27] = __byte_perm_S (w[12], w[11], selector); + w[26] = __byte_perm_S (w[11], w[10], selector); + w[25] = __byte_perm_S (w[10], w[ 9], selector); + w[24] = __byte_perm_S (w[ 9], w[ 8], selector); + w[23] = __byte_perm_S (w[ 8], w[ 7], selector); + w[22] = __byte_perm_S (w[ 7], w[ 6], selector); + w[21] = __byte_perm_S (w[ 6], w[ 5], selector); + w[20] = __byte_perm_S (w[ 5], w[ 4], selector); + w[19] = __byte_perm_S (w[ 4], w[ 3], selector); + w[18] = __byte_perm_S (w[ 3], w[ 2], selector); + w[17] = __byte_perm_S (w[ 2], w[ 1], selector); + w[16] = __byte_perm_S (w[ 1], w[ 0], selector); + w[15] = __byte_perm_S (w[ 0], 0, selector); + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 
0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 16: + w[63] = __byte_perm_S (w[47], w[46], selector); + w[62] = __byte_perm_S (w[46], w[45], selector); + w[61] = __byte_perm_S (w[45], w[44], selector); + w[60] = __byte_perm_S (w[44], w[43], selector); + w[59] = __byte_perm_S (w[43], w[42], selector); + w[58] = __byte_perm_S (w[42], w[41], selector); + w[57] = __byte_perm_S (w[41], w[40], selector); + w[56] = __byte_perm_S (w[40], w[39], selector); + w[55] = __byte_perm_S (w[39], w[38], selector); + w[54] = __byte_perm_S (w[38], w[37], selector); + w[53] = __byte_perm_S (w[37], w[36], selector); + w[52] = __byte_perm_S (w[36], w[35], selector); + w[51] = __byte_perm_S (w[35], w[34], selector); + w[50] = __byte_perm_S (w[34], w[33], selector); + w[49] = __byte_perm_S (w[33], w[32], selector); + w[48] = __byte_perm_S (w[32], w[31], selector); + w[47] = __byte_perm_S (w[31], w[30], selector); + w[46] = __byte_perm_S (w[30], w[29], selector); + w[45] = __byte_perm_S (w[29], w[28], selector); + w[44] = __byte_perm_S (w[28], w[27], selector); + w[43] = __byte_perm_S (w[27], w[26], selector); + w[42] = __byte_perm_S (w[26], w[25], selector); + w[41] = __byte_perm_S (w[25], w[24], selector); + w[40] = __byte_perm_S (w[24], w[23], selector); + w[39] = __byte_perm_S (w[23], w[22], selector); + w[38] = __byte_perm_S (w[22], w[21], selector); + w[37] = __byte_perm_S (w[21], w[20], selector); + w[36] = __byte_perm_S (w[20], w[19], selector); + w[35] = __byte_perm_S (w[19], w[18], selector); + w[34] = __byte_perm_S (w[18], w[17], selector); + w[33] = __byte_perm_S (w[17], w[16], selector); + w[32] = __byte_perm_S (w[16], w[15], selector); + w[31] = __byte_perm_S (w[15], w[14], selector); + w[30] = __byte_perm_S (w[14], w[13], selector); + w[29] = __byte_perm_S (w[13], w[12], selector); + w[28] = __byte_perm_S (w[12], w[11], selector); + w[27] = __byte_perm_S (w[11], w[10], 
selector); + w[26] = __byte_perm_S (w[10], w[ 9], selector); + w[25] = __byte_perm_S (w[ 9], w[ 8], selector); + w[24] = __byte_perm_S (w[ 8], w[ 7], selector); + w[23] = __byte_perm_S (w[ 7], w[ 6], selector); + w[22] = __byte_perm_S (w[ 6], w[ 5], selector); + w[21] = __byte_perm_S (w[ 5], w[ 4], selector); + w[20] = __byte_perm_S (w[ 4], w[ 3], selector); + w[19] = __byte_perm_S (w[ 3], w[ 2], selector); + w[18] = __byte_perm_S (w[ 2], w[ 1], selector); + w[17] = __byte_perm_S (w[ 1], w[ 0], selector); + w[16] = __byte_perm_S (w[ 0], 0, selector); + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 17: + w[63] = __byte_perm_S (w[46], w[45], selector); + w[62] = __byte_perm_S (w[45], w[44], selector); + w[61] = __byte_perm_S (w[44], w[43], selector); + w[60] = __byte_perm_S (w[43], w[42], selector); + w[59] = __byte_perm_S (w[42], w[41], selector); + w[58] = __byte_perm_S (w[41], w[40], selector); + w[57] = __byte_perm_S (w[40], w[39], selector); + w[56] = __byte_perm_S (w[39], w[38], selector); + w[55] = __byte_perm_S (w[38], w[37], selector); + w[54] = __byte_perm_S (w[37], w[36], selector); + w[53] = __byte_perm_S (w[36], w[35], selector); + w[52] = __byte_perm_S (w[35], w[34], selector); + w[51] = __byte_perm_S (w[34], w[33], selector); + w[50] = __byte_perm_S (w[33], w[32], selector); + w[49] = __byte_perm_S (w[32], w[31], selector); + w[48] = __byte_perm_S (w[31], w[30], selector); + w[47] = __byte_perm_S (w[30], w[29], selector); + w[46] = __byte_perm_S (w[29], w[28], selector); + w[45] = __byte_perm_S (w[28], w[27], selector); + w[44] = __byte_perm_S (w[27], w[26], selector); + w[43] = __byte_perm_S (w[26], w[25], selector); + w[42] = __byte_perm_S (w[25], w[24], selector); + w[41] = __byte_perm_S (w[24], w[23], selector); + w[40] = __byte_perm_S (w[23], w[22], selector); + w[39] = 
__byte_perm_S (w[22], w[21], selector); + w[38] = __byte_perm_S (w[21], w[20], selector); + w[37] = __byte_perm_S (w[20], w[19], selector); + w[36] = __byte_perm_S (w[19], w[18], selector); + w[35] = __byte_perm_S (w[18], w[17], selector); + w[34] = __byte_perm_S (w[17], w[16], selector); + w[33] = __byte_perm_S (w[16], w[15], selector); + w[32] = __byte_perm_S (w[15], w[14], selector); + w[31] = __byte_perm_S (w[14], w[13], selector); + w[30] = __byte_perm_S (w[13], w[12], selector); + w[29] = __byte_perm_S (w[12], w[11], selector); + w[28] = __byte_perm_S (w[11], w[10], selector); + w[27] = __byte_perm_S (w[10], w[ 9], selector); + w[26] = __byte_perm_S (w[ 9], w[ 8], selector); + w[25] = __byte_perm_S (w[ 8], w[ 7], selector); + w[24] = __byte_perm_S (w[ 7], w[ 6], selector); + w[23] = __byte_perm_S (w[ 6], w[ 5], selector); + w[22] = __byte_perm_S (w[ 5], w[ 4], selector); + w[21] = __byte_perm_S (w[ 4], w[ 3], selector); + w[20] = __byte_perm_S (w[ 3], w[ 2], selector); + w[19] = __byte_perm_S (w[ 2], w[ 1], selector); + w[18] = __byte_perm_S (w[ 1], w[ 0], selector); + w[17] = __byte_perm_S (w[ 0], 0, selector); + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 18: + w[63] = __byte_perm_S (w[45], w[44], selector); + w[62] = __byte_perm_S (w[44], w[43], selector); + w[61] = __byte_perm_S (w[43], w[42], selector); + w[60] = __byte_perm_S (w[42], w[41], selector); + w[59] = __byte_perm_S (w[41], w[40], selector); + w[58] = __byte_perm_S (w[40], w[39], selector); + w[57] = __byte_perm_S (w[39], w[38], selector); + w[56] = __byte_perm_S (w[38], w[37], selector); + w[55] = __byte_perm_S (w[37], w[36], selector); + w[54] = __byte_perm_S (w[36], w[35], selector); + w[53] = __byte_perm_S (w[35], w[34], selector); + w[52] = __byte_perm_S (w[34], w[33], selector); + w[51] = 
__byte_perm_S (w[33], w[32], selector); + w[50] = __byte_perm_S (w[32], w[31], selector); + w[49] = __byte_perm_S (w[31], w[30], selector); + w[48] = __byte_perm_S (w[30], w[29], selector); + w[47] = __byte_perm_S (w[29], w[28], selector); + w[46] = __byte_perm_S (w[28], w[27], selector); + w[45] = __byte_perm_S (w[27], w[26], selector); + w[44] = __byte_perm_S (w[26], w[25], selector); + w[43] = __byte_perm_S (w[25], w[24], selector); + w[42] = __byte_perm_S (w[24], w[23], selector); + w[41] = __byte_perm_S (w[23], w[22], selector); + w[40] = __byte_perm_S (w[22], w[21], selector); + w[39] = __byte_perm_S (w[21], w[20], selector); + w[38] = __byte_perm_S (w[20], w[19], selector); + w[37] = __byte_perm_S (w[19], w[18], selector); + w[36] = __byte_perm_S (w[18], w[17], selector); + w[35] = __byte_perm_S (w[17], w[16], selector); + w[34] = __byte_perm_S (w[16], w[15], selector); + w[33] = __byte_perm_S (w[15], w[14], selector); + w[32] = __byte_perm_S (w[14], w[13], selector); + w[31] = __byte_perm_S (w[13], w[12], selector); + w[30] = __byte_perm_S (w[12], w[11], selector); + w[29] = __byte_perm_S (w[11], w[10], selector); + w[28] = __byte_perm_S (w[10], w[ 9], selector); + w[27] = __byte_perm_S (w[ 9], w[ 8], selector); + w[26] = __byte_perm_S (w[ 8], w[ 7], selector); + w[25] = __byte_perm_S (w[ 7], w[ 6], selector); + w[24] = __byte_perm_S (w[ 6], w[ 5], selector); + w[23] = __byte_perm_S (w[ 5], w[ 4], selector); + w[22] = __byte_perm_S (w[ 4], w[ 3], selector); + w[21] = __byte_perm_S (w[ 3], w[ 2], selector); + w[20] = __byte_perm_S (w[ 2], w[ 1], selector); + w[19] = __byte_perm_S (w[ 1], w[ 0], selector); + w[18] = __byte_perm_S (w[ 0], 0, selector); + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 19: + w[63] = __byte_perm_S (w[44], w[43], selector); + 
w[62] = __byte_perm_S (w[43], w[42], selector); + w[61] = __byte_perm_S (w[42], w[41], selector); + w[60] = __byte_perm_S (w[41], w[40], selector); + w[59] = __byte_perm_S (w[40], w[39], selector); + w[58] = __byte_perm_S (w[39], w[38], selector); + w[57] = __byte_perm_S (w[38], w[37], selector); + w[56] = __byte_perm_S (w[37], w[36], selector); + w[55] = __byte_perm_S (w[36], w[35], selector); + w[54] = __byte_perm_S (w[35], w[34], selector); + w[53] = __byte_perm_S (w[34], w[33], selector); + w[52] = __byte_perm_S (w[33], w[32], selector); + w[51] = __byte_perm_S (w[32], w[31], selector); + w[50] = __byte_perm_S (w[31], w[30], selector); + w[49] = __byte_perm_S (w[30], w[29], selector); + w[48] = __byte_perm_S (w[29], w[28], selector); + w[47] = __byte_perm_S (w[28], w[27], selector); + w[46] = __byte_perm_S (w[27], w[26], selector); + w[45] = __byte_perm_S (w[26], w[25], selector); + w[44] = __byte_perm_S (w[25], w[24], selector); + w[43] = __byte_perm_S (w[24], w[23], selector); + w[42] = __byte_perm_S (w[23], w[22], selector); + w[41] = __byte_perm_S (w[22], w[21], selector); + w[40] = __byte_perm_S (w[21], w[20], selector); + w[39] = __byte_perm_S (w[20], w[19], selector); + w[38] = __byte_perm_S (w[19], w[18], selector); + w[37] = __byte_perm_S (w[18], w[17], selector); + w[36] = __byte_perm_S (w[17], w[16], selector); + w[35] = __byte_perm_S (w[16], w[15], selector); + w[34] = __byte_perm_S (w[15], w[14], selector); + w[33] = __byte_perm_S (w[14], w[13], selector); + w[32] = __byte_perm_S (w[13], w[12], selector); + w[31] = __byte_perm_S (w[12], w[11], selector); + w[30] = __byte_perm_S (w[11], w[10], selector); + w[29] = __byte_perm_S (w[10], w[ 9], selector); + w[28] = __byte_perm_S (w[ 9], w[ 8], selector); + w[27] = __byte_perm_S (w[ 8], w[ 7], selector); + w[26] = __byte_perm_S (w[ 7], w[ 6], selector); + w[25] = __byte_perm_S (w[ 6], w[ 5], selector); + w[24] = __byte_perm_S (w[ 5], w[ 4], selector); + w[23] = __byte_perm_S (w[ 4], w[ 3], selector); + 
w[22] = __byte_perm_S (w[ 3], w[ 2], selector); + w[21] = __byte_perm_S (w[ 2], w[ 1], selector); + w[20] = __byte_perm_S (w[ 1], w[ 0], selector); + w[19] = __byte_perm_S (w[ 0], 0, selector); + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 20: + w[63] = __byte_perm_S (w[43], w[42], selector); + w[62] = __byte_perm_S (w[42], w[41], selector); + w[61] = __byte_perm_S (w[41], w[40], selector); + w[60] = __byte_perm_S (w[40], w[39], selector); + w[59] = __byte_perm_S (w[39], w[38], selector); + w[58] = __byte_perm_S (w[38], w[37], selector); + w[57] = __byte_perm_S (w[37], w[36], selector); + w[56] = __byte_perm_S (w[36], w[35], selector); + w[55] = __byte_perm_S (w[35], w[34], selector); + w[54] = __byte_perm_S (w[34], w[33], selector); + w[53] = __byte_perm_S (w[33], w[32], selector); + w[52] = __byte_perm_S (w[32], w[31], selector); + w[51] = __byte_perm_S (w[31], w[30], selector); + w[50] = __byte_perm_S (w[30], w[29], selector); + w[49] = __byte_perm_S (w[29], w[28], selector); + w[48] = __byte_perm_S (w[28], w[27], selector); + w[47] = __byte_perm_S (w[27], w[26], selector); + w[46] = __byte_perm_S (w[26], w[25], selector); + w[45] = __byte_perm_S (w[25], w[24], selector); + w[44] = __byte_perm_S (w[24], w[23], selector); + w[43] = __byte_perm_S (w[23], w[22], selector); + w[42] = __byte_perm_S (w[22], w[21], selector); + w[41] = __byte_perm_S (w[21], w[20], selector); + w[40] = __byte_perm_S (w[20], w[19], selector); + w[39] = __byte_perm_S (w[19], w[18], selector); + w[38] = __byte_perm_S (w[18], w[17], selector); + w[37] = __byte_perm_S (w[17], w[16], selector); + w[36] = __byte_perm_S (w[16], w[15], selector); + w[35] = __byte_perm_S (w[15], w[14], selector); + w[34] = __byte_perm_S (w[14], w[13], selector); + w[33] = __byte_perm_S (w[13], 
w[12], selector); + w[32] = __byte_perm_S (w[12], w[11], selector); + w[31] = __byte_perm_S (w[11], w[10], selector); + w[30] = __byte_perm_S (w[10], w[ 9], selector); + w[29] = __byte_perm_S (w[ 9], w[ 8], selector); + w[28] = __byte_perm_S (w[ 8], w[ 7], selector); + w[27] = __byte_perm_S (w[ 7], w[ 6], selector); + w[26] = __byte_perm_S (w[ 6], w[ 5], selector); + w[25] = __byte_perm_S (w[ 5], w[ 4], selector); + w[24] = __byte_perm_S (w[ 4], w[ 3], selector); + w[23] = __byte_perm_S (w[ 3], w[ 2], selector); + w[22] = __byte_perm_S (w[ 2], w[ 1], selector); + w[21] = __byte_perm_S (w[ 1], w[ 0], selector); + w[20] = __byte_perm_S (w[ 0], 0, selector); + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 21: + w[63] = __byte_perm_S (w[42], w[41], selector); + w[62] = __byte_perm_S (w[41], w[40], selector); + w[61] = __byte_perm_S (w[40], w[39], selector); + w[60] = __byte_perm_S (w[39], w[38], selector); + w[59] = __byte_perm_S (w[38], w[37], selector); + w[58] = __byte_perm_S (w[37], w[36], selector); + w[57] = __byte_perm_S (w[36], w[35], selector); + w[56] = __byte_perm_S (w[35], w[34], selector); + w[55] = __byte_perm_S (w[34], w[33], selector); + w[54] = __byte_perm_S (w[33], w[32], selector); + w[53] = __byte_perm_S (w[32], w[31], selector); + w[52] = __byte_perm_S (w[31], w[30], selector); + w[51] = __byte_perm_S (w[30], w[29], selector); + w[50] = __byte_perm_S (w[29], w[28], selector); + w[49] = __byte_perm_S (w[28], w[27], selector); + w[48] = __byte_perm_S (w[27], w[26], selector); + w[47] = __byte_perm_S (w[26], w[25], selector); + w[46] = __byte_perm_S (w[25], w[24], selector); + w[45] = __byte_perm_S (w[24], w[23], selector); + w[44] = __byte_perm_S (w[23], w[22], selector); + w[43] = __byte_perm_S (w[22], w[21], selector); + 
w[42] = __byte_perm_S (w[21], w[20], selector); + w[41] = __byte_perm_S (w[20], w[19], selector); + w[40] = __byte_perm_S (w[19], w[18], selector); + w[39] = __byte_perm_S (w[18], w[17], selector); + w[38] = __byte_perm_S (w[17], w[16], selector); + w[37] = __byte_perm_S (w[16], w[15], selector); + w[36] = __byte_perm_S (w[15], w[14], selector); + w[35] = __byte_perm_S (w[14], w[13], selector); + w[34] = __byte_perm_S (w[13], w[12], selector); + w[33] = __byte_perm_S (w[12], w[11], selector); + w[32] = __byte_perm_S (w[11], w[10], selector); + w[31] = __byte_perm_S (w[10], w[ 9], selector); + w[30] = __byte_perm_S (w[ 9], w[ 8], selector); + w[29] = __byte_perm_S (w[ 8], w[ 7], selector); + w[28] = __byte_perm_S (w[ 7], w[ 6], selector); + w[27] = __byte_perm_S (w[ 6], w[ 5], selector); + w[26] = __byte_perm_S (w[ 5], w[ 4], selector); + w[25] = __byte_perm_S (w[ 4], w[ 3], selector); + w[24] = __byte_perm_S (w[ 3], w[ 2], selector); + w[23] = __byte_perm_S (w[ 2], w[ 1], selector); + w[22] = __byte_perm_S (w[ 1], w[ 0], selector); + w[21] = __byte_perm_S (w[ 0], 0, selector); + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 22: + w[63] = __byte_perm_S (w[41], w[40], selector); + w[62] = __byte_perm_S (w[40], w[39], selector); + w[61] = __byte_perm_S (w[39], w[38], selector); + w[60] = __byte_perm_S (w[38], w[37], selector); + w[59] = __byte_perm_S (w[37], w[36], selector); + w[58] = __byte_perm_S (w[36], w[35], selector); + w[57] = __byte_perm_S (w[35], w[34], selector); + w[56] = __byte_perm_S (w[34], w[33], selector); + w[55] = __byte_perm_S (w[33], w[32], selector); + w[54] = __byte_perm_S (w[32], w[31], selector); + w[53] = __byte_perm_S (w[31], w[30], selector); + w[52] = __byte_perm_S (w[30], w[29], selector); + w[51] 
= __byte_perm_S (w[29], w[28], selector); + w[50] = __byte_perm_S (w[28], w[27], selector); + w[49] = __byte_perm_S (w[27], w[26], selector); + w[48] = __byte_perm_S (w[26], w[25], selector); + w[47] = __byte_perm_S (w[25], w[24], selector); + w[46] = __byte_perm_S (w[24], w[23], selector); + w[45] = __byte_perm_S (w[23], w[22], selector); + w[44] = __byte_perm_S (w[22], w[21], selector); + w[43] = __byte_perm_S (w[21], w[20], selector); + w[42] = __byte_perm_S (w[20], w[19], selector); + w[41] = __byte_perm_S (w[19], w[18], selector); + w[40] = __byte_perm_S (w[18], w[17], selector); + w[39] = __byte_perm_S (w[17], w[16], selector); + w[38] = __byte_perm_S (w[16], w[15], selector); + w[37] = __byte_perm_S (w[15], w[14], selector); + w[36] = __byte_perm_S (w[14], w[13], selector); + w[35] = __byte_perm_S (w[13], w[12], selector); + w[34] = __byte_perm_S (w[12], w[11], selector); + w[33] = __byte_perm_S (w[11], w[10], selector); + w[32] = __byte_perm_S (w[10], w[ 9], selector); + w[31] = __byte_perm_S (w[ 9], w[ 8], selector); + w[30] = __byte_perm_S (w[ 8], w[ 7], selector); + w[29] = __byte_perm_S (w[ 7], w[ 6], selector); + w[28] = __byte_perm_S (w[ 6], w[ 5], selector); + w[27] = __byte_perm_S (w[ 5], w[ 4], selector); + w[26] = __byte_perm_S (w[ 4], w[ 3], selector); + w[25] = __byte_perm_S (w[ 3], w[ 2], selector); + w[24] = __byte_perm_S (w[ 2], w[ 1], selector); + w[23] = __byte_perm_S (w[ 1], w[ 0], selector); + w[22] = __byte_perm_S (w[ 0], 0, selector); + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 23: + w[63] = __byte_perm_S (w[40], w[39], selector); + w[62] = __byte_perm_S (w[39], w[38], selector); + w[61] = __byte_perm_S (w[38], w[37], selector); + w[60] = __byte_perm_S (w[37], w[36], selector); + 
w[59] = __byte_perm_S (w[36], w[35], selector); + w[58] = __byte_perm_S (w[35], w[34], selector); + w[57] = __byte_perm_S (w[34], w[33], selector); + w[56] = __byte_perm_S (w[33], w[32], selector); + w[55] = __byte_perm_S (w[32], w[31], selector); + w[54] = __byte_perm_S (w[31], w[30], selector); + w[53] = __byte_perm_S (w[30], w[29], selector); + w[52] = __byte_perm_S (w[29], w[28], selector); + w[51] = __byte_perm_S (w[28], w[27], selector); + w[50] = __byte_perm_S (w[27], w[26], selector); + w[49] = __byte_perm_S (w[26], w[25], selector); + w[48] = __byte_perm_S (w[25], w[24], selector); + w[47] = __byte_perm_S (w[24], w[23], selector); + w[46] = __byte_perm_S (w[23], w[22], selector); + w[45] = __byte_perm_S (w[22], w[21], selector); + w[44] = __byte_perm_S (w[21], w[20], selector); + w[43] = __byte_perm_S (w[20], w[19], selector); + w[42] = __byte_perm_S (w[19], w[18], selector); + w[41] = __byte_perm_S (w[18], w[17], selector); + w[40] = __byte_perm_S (w[17], w[16], selector); + w[39] = __byte_perm_S (w[16], w[15], selector); + w[38] = __byte_perm_S (w[15], w[14], selector); + w[37] = __byte_perm_S (w[14], w[13], selector); + w[36] = __byte_perm_S (w[13], w[12], selector); + w[35] = __byte_perm_S (w[12], w[11], selector); + w[34] = __byte_perm_S (w[11], w[10], selector); + w[33] = __byte_perm_S (w[10], w[ 9], selector); + w[32] = __byte_perm_S (w[ 9], w[ 8], selector); + w[31] = __byte_perm_S (w[ 8], w[ 7], selector); + w[30] = __byte_perm_S (w[ 7], w[ 6], selector); + w[29] = __byte_perm_S (w[ 6], w[ 5], selector); + w[28] = __byte_perm_S (w[ 5], w[ 4], selector); + w[27] = __byte_perm_S (w[ 4], w[ 3], selector); + w[26] = __byte_perm_S (w[ 3], w[ 2], selector); + w[25] = __byte_perm_S (w[ 2], w[ 1], selector); + w[24] = __byte_perm_S (w[ 1], w[ 0], selector); + w[23] = __byte_perm_S (w[ 0], 0, selector); + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; 
+ w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 24: + w[63] = __byte_perm_S (w[39], w[38], selector); + w[62] = __byte_perm_S (w[38], w[37], selector); + w[61] = __byte_perm_S (w[37], w[36], selector); + w[60] = __byte_perm_S (w[36], w[35], selector); + w[59] = __byte_perm_S (w[35], w[34], selector); + w[58] = __byte_perm_S (w[34], w[33], selector); + w[57] = __byte_perm_S (w[33], w[32], selector); + w[56] = __byte_perm_S (w[32], w[31], selector); + w[55] = __byte_perm_S (w[31], w[30], selector); + w[54] = __byte_perm_S (w[30], w[29], selector); + w[53] = __byte_perm_S (w[29], w[28], selector); + w[52] = __byte_perm_S (w[28], w[27], selector); + w[51] = __byte_perm_S (w[27], w[26], selector); + w[50] = __byte_perm_S (w[26], w[25], selector); + w[49] = __byte_perm_S (w[25], w[24], selector); + w[48] = __byte_perm_S (w[24], w[23], selector); + w[47] = __byte_perm_S (w[23], w[22], selector); + w[46] = __byte_perm_S (w[22], w[21], selector); + w[45] = __byte_perm_S (w[21], w[20], selector); + w[44] = __byte_perm_S (w[20], w[19], selector); + w[43] = __byte_perm_S (w[19], w[18], selector); + w[42] = __byte_perm_S (w[18], w[17], selector); + w[41] = __byte_perm_S (w[17], w[16], selector); + w[40] = __byte_perm_S (w[16], w[15], selector); + w[39] = __byte_perm_S (w[15], w[14], selector); + w[38] = __byte_perm_S (w[14], w[13], selector); + w[37] = __byte_perm_S (w[13], w[12], selector); + w[36] = __byte_perm_S (w[12], w[11], selector); + w[35] = __byte_perm_S (w[11], w[10], selector); + w[34] = __byte_perm_S (w[10], w[ 9], selector); + w[33] = __byte_perm_S (w[ 9], w[ 8], selector); + w[32] = __byte_perm_S (w[ 8], w[ 7], selector); + w[31] = __byte_perm_S (w[ 7], w[ 6], selector); + w[30] = __byte_perm_S (w[ 6], w[ 5], selector); + w[29] = __byte_perm_S (w[ 5], w[ 4], selector); + w[28] = __byte_perm_S (w[ 4], w[ 3], selector); + w[27] = __byte_perm_S (w[ 3], 
w[ 2], selector); + w[26] = __byte_perm_S (w[ 2], w[ 1], selector); + w[25] = __byte_perm_S (w[ 1], w[ 0], selector); + w[24] = __byte_perm_S (w[ 0], 0, selector); + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 25: + w[63] = __byte_perm_S (w[38], w[37], selector); + w[62] = __byte_perm_S (w[37], w[36], selector); + w[61] = __byte_perm_S (w[36], w[35], selector); + w[60] = __byte_perm_S (w[35], w[34], selector); + w[59] = __byte_perm_S (w[34], w[33], selector); + w[58] = __byte_perm_S (w[33], w[32], selector); + w[57] = __byte_perm_S (w[32], w[31], selector); + w[56] = __byte_perm_S (w[31], w[30], selector); + w[55] = __byte_perm_S (w[30], w[29], selector); + w[54] = __byte_perm_S (w[29], w[28], selector); + w[53] = __byte_perm_S (w[28], w[27], selector); + w[52] = __byte_perm_S (w[27], w[26], selector); + w[51] = __byte_perm_S (w[26], w[25], selector); + w[50] = __byte_perm_S (w[25], w[24], selector); + w[49] = __byte_perm_S (w[24], w[23], selector); + w[48] = __byte_perm_S (w[23], w[22], selector); + w[47] = __byte_perm_S (w[22], w[21], selector); + w[46] = __byte_perm_S (w[21], w[20], selector); + w[45] = __byte_perm_S (w[20], w[19], selector); + w[44] = __byte_perm_S (w[19], w[18], selector); + w[43] = __byte_perm_S (w[18], w[17], selector); + w[42] = __byte_perm_S (w[17], w[16], selector); + w[41] = __byte_perm_S (w[16], w[15], selector); + w[40] = __byte_perm_S (w[15], w[14], selector); + w[39] = __byte_perm_S (w[14], w[13], selector); + w[38] = __byte_perm_S (w[13], w[12], selector); + w[37] = __byte_perm_S (w[12], w[11], selector); + w[36] = __byte_perm_S (w[11], w[10], selector); + w[35] = __byte_perm_S (w[10], w[ 9], selector); + w[34] = __byte_perm_S (w[ 9], w[ 8], selector); 
+ w[33] = __byte_perm_S (w[ 8], w[ 7], selector); + w[32] = __byte_perm_S (w[ 7], w[ 6], selector); + w[31] = __byte_perm_S (w[ 6], w[ 5], selector); + w[30] = __byte_perm_S (w[ 5], w[ 4], selector); + w[29] = __byte_perm_S (w[ 4], w[ 3], selector); + w[28] = __byte_perm_S (w[ 3], w[ 2], selector); + w[27] = __byte_perm_S (w[ 2], w[ 1], selector); + w[26] = __byte_perm_S (w[ 1], w[ 0], selector); + w[25] = __byte_perm_S (w[ 0], 0, selector); + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 26: + w[63] = __byte_perm_S (w[37], w[36], selector); + w[62] = __byte_perm_S (w[36], w[35], selector); + w[61] = __byte_perm_S (w[35], w[34], selector); + w[60] = __byte_perm_S (w[34], w[33], selector); + w[59] = __byte_perm_S (w[33], w[32], selector); + w[58] = __byte_perm_S (w[32], w[31], selector); + w[57] = __byte_perm_S (w[31], w[30], selector); + w[56] = __byte_perm_S (w[30], w[29], selector); + w[55] = __byte_perm_S (w[29], w[28], selector); + w[54] = __byte_perm_S (w[28], w[27], selector); + w[53] = __byte_perm_S (w[27], w[26], selector); + w[52] = __byte_perm_S (w[26], w[25], selector); + w[51] = __byte_perm_S (w[25], w[24], selector); + w[50] = __byte_perm_S (w[24], w[23], selector); + w[49] = __byte_perm_S (w[23], w[22], selector); + w[48] = __byte_perm_S (w[22], w[21], selector); + w[47] = __byte_perm_S (w[21], w[20], selector); + w[46] = __byte_perm_S (w[20], w[19], selector); + w[45] = __byte_perm_S (w[19], w[18], selector); + w[44] = __byte_perm_S (w[18], w[17], selector); + w[43] = __byte_perm_S (w[17], w[16], selector); + w[42] = __byte_perm_S (w[16], w[15], selector); + w[41] = __byte_perm_S (w[15], w[14], selector); + w[40] = __byte_perm_S (w[14], w[13], selector); + 
w[39] = __byte_perm_S (w[13], w[12], selector); + w[38] = __byte_perm_S (w[12], w[11], selector); + w[37] = __byte_perm_S (w[11], w[10], selector); + w[36] = __byte_perm_S (w[10], w[ 9], selector); + w[35] = __byte_perm_S (w[ 9], w[ 8], selector); + w[34] = __byte_perm_S (w[ 8], w[ 7], selector); + w[33] = __byte_perm_S (w[ 7], w[ 6], selector); + w[32] = __byte_perm_S (w[ 6], w[ 5], selector); + w[31] = __byte_perm_S (w[ 5], w[ 4], selector); + w[30] = __byte_perm_S (w[ 4], w[ 3], selector); + w[29] = __byte_perm_S (w[ 3], w[ 2], selector); + w[28] = __byte_perm_S (w[ 2], w[ 1], selector); + w[27] = __byte_perm_S (w[ 1], w[ 0], selector); + w[26] = __byte_perm_S (w[ 0], 0, selector); + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 27: + w[63] = __byte_perm_S (w[36], w[35], selector); + w[62] = __byte_perm_S (w[35], w[34], selector); + w[61] = __byte_perm_S (w[34], w[33], selector); + w[60] = __byte_perm_S (w[33], w[32], selector); + w[59] = __byte_perm_S (w[32], w[31], selector); + w[58] = __byte_perm_S (w[31], w[30], selector); + w[57] = __byte_perm_S (w[30], w[29], selector); + w[56] = __byte_perm_S (w[29], w[28], selector); + w[55] = __byte_perm_S (w[28], w[27], selector); + w[54] = __byte_perm_S (w[27], w[26], selector); + w[53] = __byte_perm_S (w[26], w[25], selector); + w[52] = __byte_perm_S (w[25], w[24], selector); + w[51] = __byte_perm_S (w[24], w[23], selector); + w[50] = __byte_perm_S (w[23], w[22], selector); + w[49] = __byte_perm_S (w[22], w[21], selector); + w[48] = __byte_perm_S (w[21], w[20], selector); + w[47] = __byte_perm_S (w[20], w[19], selector); + w[46] = __byte_perm_S (w[19], w[18], selector); + w[45] = __byte_perm_S (w[18], w[17], 
selector); + w[44] = __byte_perm_S (w[17], w[16], selector); + w[43] = __byte_perm_S (w[16], w[15], selector); + w[42] = __byte_perm_S (w[15], w[14], selector); + w[41] = __byte_perm_S (w[14], w[13], selector); + w[40] = __byte_perm_S (w[13], w[12], selector); + w[39] = __byte_perm_S (w[12], w[11], selector); + w[38] = __byte_perm_S (w[11], w[10], selector); + w[37] = __byte_perm_S (w[10], w[ 9], selector); + w[36] = __byte_perm_S (w[ 9], w[ 8], selector); + w[35] = __byte_perm_S (w[ 8], w[ 7], selector); + w[34] = __byte_perm_S (w[ 7], w[ 6], selector); + w[33] = __byte_perm_S (w[ 6], w[ 5], selector); + w[32] = __byte_perm_S (w[ 5], w[ 4], selector); + w[31] = __byte_perm_S (w[ 4], w[ 3], selector); + w[30] = __byte_perm_S (w[ 3], w[ 2], selector); + w[29] = __byte_perm_S (w[ 2], w[ 1], selector); + w[28] = __byte_perm_S (w[ 1], w[ 0], selector); + w[27] = __byte_perm_S (w[ 0], 0, selector); + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 28: + w[63] = __byte_perm_S (w[35], w[34], selector); + w[62] = __byte_perm_S (w[34], w[33], selector); + w[61] = __byte_perm_S (w[33], w[32], selector); + w[60] = __byte_perm_S (w[32], w[31], selector); + w[59] = __byte_perm_S (w[31], w[30], selector); + w[58] = __byte_perm_S (w[30], w[29], selector); + w[57] = __byte_perm_S (w[29], w[28], selector); + w[56] = __byte_perm_S (w[28], w[27], selector); + w[55] = __byte_perm_S (w[27], w[26], selector); + w[54] = __byte_perm_S (w[26], w[25], selector); + w[53] = __byte_perm_S (w[25], w[24], selector); + w[52] = __byte_perm_S (w[24], w[23], selector); + w[51] = __byte_perm_S (w[23], w[22], selector); + w[50] = __byte_perm_S (w[22], w[21], selector); + w[49] = 
__byte_perm_S (w[21], w[20], selector); + w[48] = __byte_perm_S (w[20], w[19], selector); + w[47] = __byte_perm_S (w[19], w[18], selector); + w[46] = __byte_perm_S (w[18], w[17], selector); + w[45] = __byte_perm_S (w[17], w[16], selector); + w[44] = __byte_perm_S (w[16], w[15], selector); + w[43] = __byte_perm_S (w[15], w[14], selector); + w[42] = __byte_perm_S (w[14], w[13], selector); + w[41] = __byte_perm_S (w[13], w[12], selector); + w[40] = __byte_perm_S (w[12], w[11], selector); + w[39] = __byte_perm_S (w[11], w[10], selector); + w[38] = __byte_perm_S (w[10], w[ 9], selector); + w[37] = __byte_perm_S (w[ 9], w[ 8], selector); + w[36] = __byte_perm_S (w[ 8], w[ 7], selector); + w[35] = __byte_perm_S (w[ 7], w[ 6], selector); + w[34] = __byte_perm_S (w[ 6], w[ 5], selector); + w[33] = __byte_perm_S (w[ 5], w[ 4], selector); + w[32] = __byte_perm_S (w[ 4], w[ 3], selector); + w[31] = __byte_perm_S (w[ 3], w[ 2], selector); + w[30] = __byte_perm_S (w[ 2], w[ 1], selector); + w[29] = __byte_perm_S (w[ 1], w[ 0], selector); + w[28] = __byte_perm_S (w[ 0], 0, selector); + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 29: + w[63] = __byte_perm_S (w[34], w[33], selector); + w[62] = __byte_perm_S (w[33], w[32], selector); + w[61] = __byte_perm_S (w[32], w[31], selector); + w[60] = __byte_perm_S (w[31], w[30], selector); + w[59] = __byte_perm_S (w[30], w[29], selector); + w[58] = __byte_perm_S (w[29], w[28], selector); + w[57] = __byte_perm_S (w[28], w[27], selector); + w[56] = __byte_perm_S (w[27], w[26], selector); + w[55] = __byte_perm_S (w[26], w[25], selector); + w[54] = __byte_perm_S (w[25], w[24], selector); + w[53] = __byte_perm_S 
(w[24], w[23], selector); + w[52] = __byte_perm_S (w[23], w[22], selector); + w[51] = __byte_perm_S (w[22], w[21], selector); + w[50] = __byte_perm_S (w[21], w[20], selector); + w[49] = __byte_perm_S (w[20], w[19], selector); + w[48] = __byte_perm_S (w[19], w[18], selector); + w[47] = __byte_perm_S (w[18], w[17], selector); + w[46] = __byte_perm_S (w[17], w[16], selector); + w[45] = __byte_perm_S (w[16], w[15], selector); + w[44] = __byte_perm_S (w[15], w[14], selector); + w[43] = __byte_perm_S (w[14], w[13], selector); + w[42] = __byte_perm_S (w[13], w[12], selector); + w[41] = __byte_perm_S (w[12], w[11], selector); + w[40] = __byte_perm_S (w[11], w[10], selector); + w[39] = __byte_perm_S (w[10], w[ 9], selector); + w[38] = __byte_perm_S (w[ 9], w[ 8], selector); + w[37] = __byte_perm_S (w[ 8], w[ 7], selector); + w[36] = __byte_perm_S (w[ 7], w[ 6], selector); + w[35] = __byte_perm_S (w[ 6], w[ 5], selector); + w[34] = __byte_perm_S (w[ 5], w[ 4], selector); + w[33] = __byte_perm_S (w[ 4], w[ 3], selector); + w[32] = __byte_perm_S (w[ 3], w[ 2], selector); + w[31] = __byte_perm_S (w[ 2], w[ 1], selector); + w[30] = __byte_perm_S (w[ 1], w[ 0], selector); + w[29] = __byte_perm_S (w[ 0], 0, selector); + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 30: + w[63] = __byte_perm_S (w[33], w[32], selector); + w[62] = __byte_perm_S (w[32], w[31], selector); + w[61] = __byte_perm_S (w[31], w[30], selector); + w[60] = __byte_perm_S (w[30], w[29], selector); + w[59] = __byte_perm_S (w[29], w[28], selector); + w[58] = __byte_perm_S (w[28], w[27], selector); + w[57] = __byte_perm_S (w[27], w[26], selector); + w[56] = __byte_perm_S 
(w[26], w[25], selector); + w[55] = __byte_perm_S (w[25], w[24], selector); + w[54] = __byte_perm_S (w[24], w[23], selector); + w[53] = __byte_perm_S (w[23], w[22], selector); + w[52] = __byte_perm_S (w[22], w[21], selector); + w[51] = __byte_perm_S (w[21], w[20], selector); + w[50] = __byte_perm_S (w[20], w[19], selector); + w[49] = __byte_perm_S (w[19], w[18], selector); + w[48] = __byte_perm_S (w[18], w[17], selector); + w[47] = __byte_perm_S (w[17], w[16], selector); + w[46] = __byte_perm_S (w[16], w[15], selector); + w[45] = __byte_perm_S (w[15], w[14], selector); + w[44] = __byte_perm_S (w[14], w[13], selector); + w[43] = __byte_perm_S (w[13], w[12], selector); + w[42] = __byte_perm_S (w[12], w[11], selector); + w[41] = __byte_perm_S (w[11], w[10], selector); + w[40] = __byte_perm_S (w[10], w[ 9], selector); + w[39] = __byte_perm_S (w[ 9], w[ 8], selector); + w[38] = __byte_perm_S (w[ 8], w[ 7], selector); + w[37] = __byte_perm_S (w[ 7], w[ 6], selector); + w[36] = __byte_perm_S (w[ 6], w[ 5], selector); + w[35] = __byte_perm_S (w[ 5], w[ 4], selector); + w[34] = __byte_perm_S (w[ 4], w[ 3], selector); + w[33] = __byte_perm_S (w[ 3], w[ 2], selector); + w[32] = __byte_perm_S (w[ 2], w[ 1], selector); + w[31] = __byte_perm_S (w[ 1], w[ 0], selector); + w[30] = __byte_perm_S (w[ 0], 0, selector); + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 31: + w[63] = __byte_perm_S (w[32], w[31], selector); + w[62] = __byte_perm_S (w[31], w[30], selector); + w[61] = __byte_perm_S (w[30], w[29], selector); + w[60] = __byte_perm_S (w[29], w[28], selector); + w[59] = __byte_perm_S (w[28], w[27], selector); + w[58] = 
__byte_perm_S (w[27], w[26], selector); + w[57] = __byte_perm_S (w[26], w[25], selector); + w[56] = __byte_perm_S (w[25], w[24], selector); + w[55] = __byte_perm_S (w[24], w[23], selector); + w[54] = __byte_perm_S (w[23], w[22], selector); + w[53] = __byte_perm_S (w[22], w[21], selector); + w[52] = __byte_perm_S (w[21], w[20], selector); + w[51] = __byte_perm_S (w[20], w[19], selector); + w[50] = __byte_perm_S (w[19], w[18], selector); + w[49] = __byte_perm_S (w[18], w[17], selector); + w[48] = __byte_perm_S (w[17], w[16], selector); + w[47] = __byte_perm_S (w[16], w[15], selector); + w[46] = __byte_perm_S (w[15], w[14], selector); + w[45] = __byte_perm_S (w[14], w[13], selector); + w[44] = __byte_perm_S (w[13], w[12], selector); + w[43] = __byte_perm_S (w[12], w[11], selector); + w[42] = __byte_perm_S (w[11], w[10], selector); + w[41] = __byte_perm_S (w[10], w[ 9], selector); + w[40] = __byte_perm_S (w[ 9], w[ 8], selector); + w[39] = __byte_perm_S (w[ 8], w[ 7], selector); + w[38] = __byte_perm_S (w[ 7], w[ 6], selector); + w[37] = __byte_perm_S (w[ 6], w[ 5], selector); + w[36] = __byte_perm_S (w[ 5], w[ 4], selector); + w[35] = __byte_perm_S (w[ 4], w[ 3], selector); + w[34] = __byte_perm_S (w[ 3], w[ 2], selector); + w[33] = __byte_perm_S (w[ 2], w[ 1], selector); + w[32] = __byte_perm_S (w[ 1], w[ 0], selector); + w[31] = __byte_perm_S (w[ 0], 0, selector); + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 32: + w[63] = __byte_perm_S (w[31], w[30], selector); + w[62] = __byte_perm_S (w[30], w[29], selector); + w[61] = __byte_perm_S (w[29], w[28], selector); + w[60] = __byte_perm_S (w[28], 
w[27], selector); + w[59] = __byte_perm_S (w[27], w[26], selector); + w[58] = __byte_perm_S (w[26], w[25], selector); + w[57] = __byte_perm_S (w[25], w[24], selector); + w[56] = __byte_perm_S (w[24], w[23], selector); + w[55] = __byte_perm_S (w[23], w[22], selector); + w[54] = __byte_perm_S (w[22], w[21], selector); + w[53] = __byte_perm_S (w[21], w[20], selector); + w[52] = __byte_perm_S (w[20], w[19], selector); + w[51] = __byte_perm_S (w[19], w[18], selector); + w[50] = __byte_perm_S (w[18], w[17], selector); + w[49] = __byte_perm_S (w[17], w[16], selector); + w[48] = __byte_perm_S (w[16], w[15], selector); + w[47] = __byte_perm_S (w[15], w[14], selector); + w[46] = __byte_perm_S (w[14], w[13], selector); + w[45] = __byte_perm_S (w[13], w[12], selector); + w[44] = __byte_perm_S (w[12], w[11], selector); + w[43] = __byte_perm_S (w[11], w[10], selector); + w[42] = __byte_perm_S (w[10], w[ 9], selector); + w[41] = __byte_perm_S (w[ 9], w[ 8], selector); + w[40] = __byte_perm_S (w[ 8], w[ 7], selector); + w[39] = __byte_perm_S (w[ 7], w[ 6], selector); + w[38] = __byte_perm_S (w[ 6], w[ 5], selector); + w[37] = __byte_perm_S (w[ 5], w[ 4], selector); + w[36] = __byte_perm_S (w[ 4], w[ 3], selector); + w[35] = __byte_perm_S (w[ 3], w[ 2], selector); + w[34] = __byte_perm_S (w[ 2], w[ 1], selector); + w[33] = __byte_perm_S (w[ 1], w[ 0], selector); + w[32] = __byte_perm_S (w[ 0], 0, selector); + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 33: + w[63] = __byte_perm_S (w[30], w[29], selector); + w[62] = __byte_perm_S (w[29], w[28], selector); + w[61] = __byte_perm_S (w[28], w[27], 
selector); + w[60] = __byte_perm_S (w[27], w[26], selector); + w[59] = __byte_perm_S (w[26], w[25], selector); + w[58] = __byte_perm_S (w[25], w[24], selector); + w[57] = __byte_perm_S (w[24], w[23], selector); + w[56] = __byte_perm_S (w[23], w[22], selector); + w[55] = __byte_perm_S (w[22], w[21], selector); + w[54] = __byte_perm_S (w[21], w[20], selector); + w[53] = __byte_perm_S (w[20], w[19], selector); + w[52] = __byte_perm_S (w[19], w[18], selector); + w[51] = __byte_perm_S (w[18], w[17], selector); + w[50] = __byte_perm_S (w[17], w[16], selector); + w[49] = __byte_perm_S (w[16], w[15], selector); + w[48] = __byte_perm_S (w[15], w[14], selector); + w[47] = __byte_perm_S (w[14], w[13], selector); + w[46] = __byte_perm_S (w[13], w[12], selector); + w[45] = __byte_perm_S (w[12], w[11], selector); + w[44] = __byte_perm_S (w[11], w[10], selector); + w[43] = __byte_perm_S (w[10], w[ 9], selector); + w[42] = __byte_perm_S (w[ 9], w[ 8], selector); + w[41] = __byte_perm_S (w[ 8], w[ 7], selector); + w[40] = __byte_perm_S (w[ 7], w[ 6], selector); + w[39] = __byte_perm_S (w[ 6], w[ 5], selector); + w[38] = __byte_perm_S (w[ 5], w[ 4], selector); + w[37] = __byte_perm_S (w[ 4], w[ 3], selector); + w[36] = __byte_perm_S (w[ 3], w[ 2], selector); + w[35] = __byte_perm_S (w[ 2], w[ 1], selector); + w[34] = __byte_perm_S (w[ 1], w[ 0], selector); + w[33] = __byte_perm_S (w[ 0], 0, selector); + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 34: + w[63] = __byte_perm_S (w[29], w[28], selector); + w[62] = __byte_perm_S (w[28], w[27], selector); + w[61] = __byte_perm_S (w[27], w[26], 
selector); + w[60] = __byte_perm_S (w[26], w[25], selector); + w[59] = __byte_perm_S (w[25], w[24], selector); + w[58] = __byte_perm_S (w[24], w[23], selector); + w[57] = __byte_perm_S (w[23], w[22], selector); + w[56] = __byte_perm_S (w[22], w[21], selector); + w[55] = __byte_perm_S (w[21], w[20], selector); + w[54] = __byte_perm_S (w[20], w[19], selector); + w[53] = __byte_perm_S (w[19], w[18], selector); + w[52] = __byte_perm_S (w[18], w[17], selector); + w[51] = __byte_perm_S (w[17], w[16], selector); + w[50] = __byte_perm_S (w[16], w[15], selector); + w[49] = __byte_perm_S (w[15], w[14], selector); + w[48] = __byte_perm_S (w[14], w[13], selector); + w[47] = __byte_perm_S (w[13], w[12], selector); + w[46] = __byte_perm_S (w[12], w[11], selector); + w[45] = __byte_perm_S (w[11], w[10], selector); + w[44] = __byte_perm_S (w[10], w[ 9], selector); + w[43] = __byte_perm_S (w[ 9], w[ 8], selector); + w[42] = __byte_perm_S (w[ 8], w[ 7], selector); + w[41] = __byte_perm_S (w[ 7], w[ 6], selector); + w[40] = __byte_perm_S (w[ 6], w[ 5], selector); + w[39] = __byte_perm_S (w[ 5], w[ 4], selector); + w[38] = __byte_perm_S (w[ 4], w[ 3], selector); + w[37] = __byte_perm_S (w[ 3], w[ 2], selector); + w[36] = __byte_perm_S (w[ 2], w[ 1], selector); + w[35] = __byte_perm_S (w[ 1], w[ 0], selector); + w[34] = __byte_perm_S (w[ 0], 0, selector); + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 35: + w[63] = __byte_perm_S (w[28], w[27], selector); + w[62] = __byte_perm_S (w[27], w[26], selector); + w[61] = __byte_perm_S (w[26], w[25], selector); + w[60] = __byte_perm_S 
(w[25], w[24], selector); + w[59] = __byte_perm_S (w[24], w[23], selector); + w[58] = __byte_perm_S (w[23], w[22], selector); + w[57] = __byte_perm_S (w[22], w[21], selector); + w[56] = __byte_perm_S (w[21], w[20], selector); + w[55] = __byte_perm_S (w[20], w[19], selector); + w[54] = __byte_perm_S (w[19], w[18], selector); + w[53] = __byte_perm_S (w[18], w[17], selector); + w[52] = __byte_perm_S (w[17], w[16], selector); + w[51] = __byte_perm_S (w[16], w[15], selector); + w[50] = __byte_perm_S (w[15], w[14], selector); + w[49] = __byte_perm_S (w[14], w[13], selector); + w[48] = __byte_perm_S (w[13], w[12], selector); + w[47] = __byte_perm_S (w[12], w[11], selector); + w[46] = __byte_perm_S (w[11], w[10], selector); + w[45] = __byte_perm_S (w[10], w[ 9], selector); + w[44] = __byte_perm_S (w[ 9], w[ 8], selector); + w[43] = __byte_perm_S (w[ 8], w[ 7], selector); + w[42] = __byte_perm_S (w[ 7], w[ 6], selector); + w[41] = __byte_perm_S (w[ 6], w[ 5], selector); + w[40] = __byte_perm_S (w[ 5], w[ 4], selector); + w[39] = __byte_perm_S (w[ 4], w[ 3], selector); + w[38] = __byte_perm_S (w[ 3], w[ 2], selector); + w[37] = __byte_perm_S (w[ 2], w[ 1], selector); + w[36] = __byte_perm_S (w[ 1], w[ 0], selector); + w[35] = __byte_perm_S (w[ 0], 0, selector); + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 36: + w[63] = __byte_perm_S (w[27], w[26], selector); + w[62] = __byte_perm_S (w[26], w[25], selector); + w[61] = __byte_perm_S (w[25], w[24], selector); + w[60] = __byte_perm_S (w[24], w[23], selector); + w[59] = __byte_perm_S (w[23], w[22], 
selector); + w[58] = __byte_perm_S (w[22], w[21], selector); + w[57] = __byte_perm_S (w[21], w[20], selector); + w[56] = __byte_perm_S (w[20], w[19], selector); + w[55] = __byte_perm_S (w[19], w[18], selector); + w[54] = __byte_perm_S (w[18], w[17], selector); + w[53] = __byte_perm_S (w[17], w[16], selector); + w[52] = __byte_perm_S (w[16], w[15], selector); + w[51] = __byte_perm_S (w[15], w[14], selector); + w[50] = __byte_perm_S (w[14], w[13], selector); + w[49] = __byte_perm_S (w[13], w[12], selector); + w[48] = __byte_perm_S (w[12], w[11], selector); + w[47] = __byte_perm_S (w[11], w[10], selector); + w[46] = __byte_perm_S (w[10], w[ 9], selector); + w[45] = __byte_perm_S (w[ 9], w[ 8], selector); + w[44] = __byte_perm_S (w[ 8], w[ 7], selector); + w[43] = __byte_perm_S (w[ 7], w[ 6], selector); + w[42] = __byte_perm_S (w[ 6], w[ 5], selector); + w[41] = __byte_perm_S (w[ 5], w[ 4], selector); + w[40] = __byte_perm_S (w[ 4], w[ 3], selector); + w[39] = __byte_perm_S (w[ 3], w[ 2], selector); + w[38] = __byte_perm_S (w[ 2], w[ 1], selector); + w[37] = __byte_perm_S (w[ 1], w[ 0], selector); + w[36] = __byte_perm_S (w[ 0], 0, selector); + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 37: + w[63] = __byte_perm_S (w[26], w[25], selector); + w[62] = __byte_perm_S (w[25], w[24], selector); + w[61] = __byte_perm_S (w[24], w[23], selector); + w[60] = __byte_perm_S (w[23], w[22], selector); + w[59] = __byte_perm_S (w[22], w[21], selector); + w[58] = __byte_perm_S (w[21], w[20], selector); + w[57] = __byte_perm_S (w[20], w[19], selector); 
+ w[56] = __byte_perm_S (w[19], w[18], selector); + w[55] = __byte_perm_S (w[18], w[17], selector); + w[54] = __byte_perm_S (w[17], w[16], selector); + w[53] = __byte_perm_S (w[16], w[15], selector); + w[52] = __byte_perm_S (w[15], w[14], selector); + w[51] = __byte_perm_S (w[14], w[13], selector); + w[50] = __byte_perm_S (w[13], w[12], selector); + w[49] = __byte_perm_S (w[12], w[11], selector); + w[48] = __byte_perm_S (w[11], w[10], selector); + w[47] = __byte_perm_S (w[10], w[ 9], selector); + w[46] = __byte_perm_S (w[ 9], w[ 8], selector); + w[45] = __byte_perm_S (w[ 8], w[ 7], selector); + w[44] = __byte_perm_S (w[ 7], w[ 6], selector); + w[43] = __byte_perm_S (w[ 6], w[ 5], selector); + w[42] = __byte_perm_S (w[ 5], w[ 4], selector); + w[41] = __byte_perm_S (w[ 4], w[ 3], selector); + w[40] = __byte_perm_S (w[ 3], w[ 2], selector); + w[39] = __byte_perm_S (w[ 2], w[ 1], selector); + w[38] = __byte_perm_S (w[ 1], w[ 0], selector); + w[37] = __byte_perm_S (w[ 0], 0, selector); + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 38: + w[63] = __byte_perm_S (w[25], w[24], selector); + w[62] = __byte_perm_S (w[24], w[23], selector); + w[61] = __byte_perm_S (w[23], w[22], selector); + w[60] = __byte_perm_S (w[22], w[21], selector); + w[59] = __byte_perm_S (w[21], w[20], selector); + w[58] = __byte_perm_S (w[20], w[19], selector); + w[57] = __byte_perm_S (w[19], w[18], selector); + w[56] = __byte_perm_S (w[18], w[17], selector); + w[55] = __byte_perm_S (w[17], w[16], selector); + w[54] = __byte_perm_S (w[16], w[15], 
selector); + w[53] = __byte_perm_S (w[15], w[14], selector); + w[52] = __byte_perm_S (w[14], w[13], selector); + w[51] = __byte_perm_S (w[13], w[12], selector); + w[50] = __byte_perm_S (w[12], w[11], selector); + w[49] = __byte_perm_S (w[11], w[10], selector); + w[48] = __byte_perm_S (w[10], w[ 9], selector); + w[47] = __byte_perm_S (w[ 9], w[ 8], selector); + w[46] = __byte_perm_S (w[ 8], w[ 7], selector); + w[45] = __byte_perm_S (w[ 7], w[ 6], selector); + w[44] = __byte_perm_S (w[ 6], w[ 5], selector); + w[43] = __byte_perm_S (w[ 5], w[ 4], selector); + w[42] = __byte_perm_S (w[ 4], w[ 3], selector); + w[41] = __byte_perm_S (w[ 3], w[ 2], selector); + w[40] = __byte_perm_S (w[ 2], w[ 1], selector); + w[39] = __byte_perm_S (w[ 1], w[ 0], selector); + w[38] = __byte_perm_S (w[ 0], 0, selector); + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 39: + w[63] = __byte_perm_S (w[24], w[23], selector); + w[62] = __byte_perm_S (w[23], w[22], selector); + w[61] = __byte_perm_S (w[22], w[21], selector); + w[60] = __byte_perm_S (w[21], w[20], selector); + w[59] = __byte_perm_S (w[20], w[19], selector); + w[58] = __byte_perm_S (w[19], w[18], selector); + w[57] = __byte_perm_S (w[18], w[17], selector); + w[56] = __byte_perm_S (w[17], w[16], selector); + w[55] = __byte_perm_S (w[16], w[15], selector); + w[54] = __byte_perm_S (w[15], w[14], selector); + w[53] = __byte_perm_S (w[14], w[13], selector); + w[52] = __byte_perm_S (w[13], w[12], selector); + w[51] = __byte_perm_S (w[12], w[11], selector); + w[50] = __byte_perm_S 
(w[11], w[10], selector); + w[49] = __byte_perm_S (w[10], w[ 9], selector); + w[48] = __byte_perm_S (w[ 9], w[ 8], selector); + w[47] = __byte_perm_S (w[ 8], w[ 7], selector); + w[46] = __byte_perm_S (w[ 7], w[ 6], selector); + w[45] = __byte_perm_S (w[ 6], w[ 5], selector); + w[44] = __byte_perm_S (w[ 5], w[ 4], selector); + w[43] = __byte_perm_S (w[ 4], w[ 3], selector); + w[42] = __byte_perm_S (w[ 3], w[ 2], selector); + w[41] = __byte_perm_S (w[ 2], w[ 1], selector); + w[40] = __byte_perm_S (w[ 1], w[ 0], selector); + w[39] = __byte_perm_S (w[ 0], 0, selector); + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 40: + w[63] = __byte_perm_S (w[23], w[22], selector); + w[62] = __byte_perm_S (w[22], w[21], selector); + w[61] = __byte_perm_S (w[21], w[20], selector); + w[60] = __byte_perm_S (w[20], w[19], selector); + w[59] = __byte_perm_S (w[19], w[18], selector); + w[58] = __byte_perm_S (w[18], w[17], selector); + w[57] = __byte_perm_S (w[17], w[16], selector); + w[56] = __byte_perm_S (w[16], w[15], selector); + w[55] = __byte_perm_S (w[15], w[14], selector); + w[54] = __byte_perm_S (w[14], w[13], selector); + w[53] = __byte_perm_S (w[13], w[12], selector); + w[52] = __byte_perm_S (w[12], w[11], selector); + w[51] = __byte_perm_S (w[11], w[10], selector); + w[50] = __byte_perm_S (w[10], w[ 9], selector); + w[49] = __byte_perm_S (w[ 9], w[ 8], selector); + w[48] = __byte_perm_S (w[ 8], w[ 7], selector); + w[47] = __byte_perm_S (w[ 7], w[ 6], selector); + w[46] = __byte_perm_S (w[ 6], w[ 5], 
selector); + w[45] = __byte_perm_S (w[ 5], w[ 4], selector); + w[44] = __byte_perm_S (w[ 4], w[ 3], selector); + w[43] = __byte_perm_S (w[ 3], w[ 2], selector); + w[42] = __byte_perm_S (w[ 2], w[ 1], selector); + w[41] = __byte_perm_S (w[ 1], w[ 0], selector); + w[40] = __byte_perm_S (w[ 0], 0, selector); + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 41: + w[63] = __byte_perm_S (w[22], w[21], selector); + w[62] = __byte_perm_S (w[21], w[20], selector); + w[61] = __byte_perm_S (w[20], w[19], selector); + w[60] = __byte_perm_S (w[19], w[18], selector); + w[59] = __byte_perm_S (w[18], w[17], selector); + w[58] = __byte_perm_S (w[17], w[16], selector); + w[57] = __byte_perm_S (w[16], w[15], selector); + w[56] = __byte_perm_S (w[15], w[14], selector); + w[55] = __byte_perm_S (w[14], w[13], selector); + w[54] = __byte_perm_S (w[13], w[12], selector); + w[53] = __byte_perm_S (w[12], w[11], selector); + w[52] = __byte_perm_S (w[11], w[10], selector); + w[51] = __byte_perm_S (w[10], w[ 9], selector); + w[50] = __byte_perm_S (w[ 9], w[ 8], selector); + w[49] = __byte_perm_S (w[ 8], w[ 7], selector); + w[48] = __byte_perm_S (w[ 7], w[ 6], selector); + w[47] = __byte_perm_S (w[ 6], w[ 5], selector); + w[46] = __byte_perm_S (w[ 5], w[ 4], selector); + w[45] = __byte_perm_S (w[ 4], w[ 3], selector); + w[44] = __byte_perm_S (w[ 3], w[ 2], selector); + w[43] = __byte_perm_S (w[ 2], w[ 1], selector); + w[42] = __byte_perm_S (w[ 1], w[ 0], selector); + w[41] = __byte_perm_S (w[ 0], 0, selector); + 
w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 42: + w[63] = __byte_perm_S (w[21], w[20], selector); + w[62] = __byte_perm_S (w[20], w[19], selector); + w[61] = __byte_perm_S (w[19], w[18], selector); + w[60] = __byte_perm_S (w[18], w[17], selector); + w[59] = __byte_perm_S (w[17], w[16], selector); + w[58] = __byte_perm_S (w[16], w[15], selector); + w[57] = __byte_perm_S (w[15], w[14], selector); + w[56] = __byte_perm_S (w[14], w[13], selector); + w[55] = __byte_perm_S (w[13], w[12], selector); + w[54] = __byte_perm_S (w[12], w[11], selector); + w[53] = __byte_perm_S (w[11], w[10], selector); + w[52] = __byte_perm_S (w[10], w[ 9], selector); + w[51] = __byte_perm_S (w[ 9], w[ 8], selector); + w[50] = __byte_perm_S (w[ 8], w[ 7], selector); + w[49] = __byte_perm_S (w[ 7], w[ 6], selector); + w[48] = __byte_perm_S (w[ 6], w[ 5], selector); + w[47] = __byte_perm_S (w[ 5], w[ 4], selector); + w[46] = __byte_perm_S (w[ 4], w[ 3], selector); + w[45] = __byte_perm_S (w[ 3], w[ 2], selector); + w[44] = __byte_perm_S (w[ 2], w[ 1], selector); + w[43] = __byte_perm_S (w[ 1], w[ 0], selector); + w[42] = __byte_perm_S (w[ 0], 0, selector); + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 
0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 43: + w[63] = __byte_perm_S (w[20], w[19], selector); + w[62] = __byte_perm_S (w[19], w[18], selector); + w[61] = __byte_perm_S (w[18], w[17], selector); + w[60] = __byte_perm_S (w[17], w[16], selector); + w[59] = __byte_perm_S (w[16], w[15], selector); + w[58] = __byte_perm_S (w[15], w[14], selector); + w[57] = __byte_perm_S (w[14], w[13], selector); + w[56] = __byte_perm_S (w[13], w[12], selector); + w[55] = __byte_perm_S (w[12], w[11], selector); + w[54] = __byte_perm_S (w[11], w[10], selector); + w[53] = __byte_perm_S (w[10], w[ 9], selector); + w[52] = __byte_perm_S (w[ 9], w[ 8], selector); + w[51] = __byte_perm_S (w[ 8], w[ 7], selector); + w[50] = __byte_perm_S (w[ 7], w[ 6], selector); + w[49] = __byte_perm_S (w[ 6], w[ 5], selector); + w[48] = __byte_perm_S (w[ 5], w[ 4], selector); + w[47] = __byte_perm_S (w[ 4], w[ 3], selector); + w[46] = __byte_perm_S (w[ 3], w[ 2], selector); + w[45] = __byte_perm_S (w[ 2], w[ 1], selector); + w[44] = __byte_perm_S (w[ 1], w[ 0], selector); + w[43] = __byte_perm_S (w[ 0], 0, selector); + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 44: + w[63] = __byte_perm_S (w[19], w[18], selector); + w[62] = __byte_perm_S (w[18], w[17], selector); + w[61] = __byte_perm_S (w[17], w[16], 
selector); + w[60] = __byte_perm_S (w[16], w[15], selector); + w[59] = __byte_perm_S (w[15], w[14], selector); + w[58] = __byte_perm_S (w[14], w[13], selector); + w[57] = __byte_perm_S (w[13], w[12], selector); + w[56] = __byte_perm_S (w[12], w[11], selector); + w[55] = __byte_perm_S (w[11], w[10], selector); + w[54] = __byte_perm_S (w[10], w[ 9], selector); + w[53] = __byte_perm_S (w[ 9], w[ 8], selector); + w[52] = __byte_perm_S (w[ 8], w[ 7], selector); + w[51] = __byte_perm_S (w[ 7], w[ 6], selector); + w[50] = __byte_perm_S (w[ 6], w[ 5], selector); + w[49] = __byte_perm_S (w[ 5], w[ 4], selector); + w[48] = __byte_perm_S (w[ 4], w[ 3], selector); + w[47] = __byte_perm_S (w[ 3], w[ 2], selector); + w[46] = __byte_perm_S (w[ 2], w[ 1], selector); + w[45] = __byte_perm_S (w[ 1], w[ 0], selector); + w[44] = __byte_perm_S (w[ 0], 0, selector); + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 45: + w[63] = __byte_perm_S (w[18], w[17], selector); + w[62] = __byte_perm_S (w[17], w[16], selector); + w[61] = __byte_perm_S (w[16], w[15], selector); + w[60] = __byte_perm_S (w[15], w[14], selector); + w[59] = __byte_perm_S (w[14], w[13], selector); + w[58] = __byte_perm_S (w[13], w[12], selector); + w[57] = __byte_perm_S (w[12], w[11], selector); + w[56] = __byte_perm_S (w[11], w[10], selector); + w[55] = __byte_perm_S (w[10], w[ 9], selector); + w[54] = __byte_perm_S (w[ 9], w[ 8], selector); + w[53] = __byte_perm_S (w[ 8], w[ 7], 
selector); + w[52] = __byte_perm_S (w[ 7], w[ 6], selector); + w[51] = __byte_perm_S (w[ 6], w[ 5], selector); + w[50] = __byte_perm_S (w[ 5], w[ 4], selector); + w[49] = __byte_perm_S (w[ 4], w[ 3], selector); + w[48] = __byte_perm_S (w[ 3], w[ 2], selector); + w[47] = __byte_perm_S (w[ 2], w[ 1], selector); + w[46] = __byte_perm_S (w[ 1], w[ 0], selector); + w[45] = __byte_perm_S (w[ 0], 0, selector); + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 46: + w[63] = __byte_perm_S (w[17], w[16], selector); + w[62] = __byte_perm_S (w[16], w[15], selector); + w[61] = __byte_perm_S (w[15], w[14], selector); + w[60] = __byte_perm_S (w[14], w[13], selector); + w[59] = __byte_perm_S (w[13], w[12], selector); + w[58] = __byte_perm_S (w[12], w[11], selector); + w[57] = __byte_perm_S (w[11], w[10], selector); + w[56] = __byte_perm_S (w[10], w[ 9], selector); + w[55] = __byte_perm_S (w[ 9], w[ 8], selector); + w[54] = __byte_perm_S (w[ 8], w[ 7], selector); + w[53] = __byte_perm_S (w[ 7], w[ 6], selector); + w[52] = __byte_perm_S (w[ 6], w[ 5], selector); + w[51] = __byte_perm_S (w[ 5], w[ 4], selector); + w[50] = __byte_perm_S (w[ 4], w[ 3], selector); + w[49] = __byte_perm_S (w[ 3], w[ 2], selector); + w[48] = __byte_perm_S (w[ 2], w[ 1], selector); + w[47] = __byte_perm_S (w[ 1], w[ 0], selector); + w[46] = __byte_perm_S (w[ 0], 0, selector); + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 
0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 47: + w[63] = __byte_perm_S (w[16], w[15], selector); + w[62] = __byte_perm_S (w[15], w[14], selector); + w[61] = __byte_perm_S (w[14], w[13], selector); + w[60] = __byte_perm_S (w[13], w[12], selector); + w[59] = __byte_perm_S (w[12], w[11], selector); + w[58] = __byte_perm_S (w[11], w[10], selector); + w[57] = __byte_perm_S (w[10], w[ 9], selector); + w[56] = __byte_perm_S (w[ 9], w[ 8], selector); + w[55] = __byte_perm_S (w[ 8], w[ 7], selector); + w[54] = __byte_perm_S (w[ 7], w[ 6], selector); + w[53] = __byte_perm_S (w[ 6], w[ 5], selector); + w[52] = __byte_perm_S (w[ 5], w[ 4], selector); + w[51] = __byte_perm_S (w[ 4], w[ 3], selector); + w[50] = __byte_perm_S (w[ 3], w[ 2], selector); + w[49] = __byte_perm_S (w[ 2], w[ 1], selector); + w[48] = __byte_perm_S (w[ 1], w[ 0], selector); + w[47] = __byte_perm_S (w[ 0], 0, selector); + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + 
break; + + case 48: + w[63] = __byte_perm_S (w[15], w[14], selector); + w[62] = __byte_perm_S (w[14], w[13], selector); + w[61] = __byte_perm_S (w[13], w[12], selector); + w[60] = __byte_perm_S (w[12], w[11], selector); + w[59] = __byte_perm_S (w[11], w[10], selector); + w[58] = __byte_perm_S (w[10], w[ 9], selector); + w[57] = __byte_perm_S (w[ 9], w[ 8], selector); + w[56] = __byte_perm_S (w[ 8], w[ 7], selector); + w[55] = __byte_perm_S (w[ 7], w[ 6], selector); + w[54] = __byte_perm_S (w[ 6], w[ 5], selector); + w[53] = __byte_perm_S (w[ 5], w[ 4], selector); + w[52] = __byte_perm_S (w[ 4], w[ 3], selector); + w[51] = __byte_perm_S (w[ 3], w[ 2], selector); + w[50] = __byte_perm_S (w[ 2], w[ 1], selector); + w[49] = __byte_perm_S (w[ 1], w[ 0], selector); + w[48] = __byte_perm_S (w[ 0], 0, selector); + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 49: + w[63] = __byte_perm_S (w[14], w[13], selector); + w[62] = __byte_perm_S (w[13], w[12], selector); + w[61] = __byte_perm_S (w[12], w[11], selector); + w[60] = __byte_perm_S (w[11], w[10], selector); + w[59] = __byte_perm_S (w[10], w[ 9], selector); + w[58] = __byte_perm_S (w[ 9], w[ 8], selector); + w[57] = __byte_perm_S (w[ 8], w[ 7], selector); + w[56] = __byte_perm_S (w[ 7], w[ 6], selector); + w[55] = __byte_perm_S (w[ 6], w[ 5], selector); + w[54] = __byte_perm_S (w[ 5], w[ 4], selector); + w[53] = __byte_perm_S (w[ 4], w[ 
3], selector); + w[52] = __byte_perm_S (w[ 3], w[ 2], selector); + w[51] = __byte_perm_S (w[ 2], w[ 1], selector); + w[50] = __byte_perm_S (w[ 1], w[ 0], selector); + w[49] = __byte_perm_S (w[ 0], 0, selector); + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 50: + w[63] = __byte_perm_S (w[13], w[12], selector); + w[62] = __byte_perm_S (w[12], w[11], selector); + w[61] = __byte_perm_S (w[11], w[10], selector); + w[60] = __byte_perm_S (w[10], w[ 9], selector); + w[59] = __byte_perm_S (w[ 9], w[ 8], selector); + w[58] = __byte_perm_S (w[ 8], w[ 7], selector); + w[57] = __byte_perm_S (w[ 7], w[ 6], selector); + w[56] = __byte_perm_S (w[ 6], w[ 5], selector); + w[55] = __byte_perm_S (w[ 5], w[ 4], selector); + w[54] = __byte_perm_S (w[ 4], w[ 3], selector); + w[53] = __byte_perm_S (w[ 3], w[ 2], selector); + w[52] = __byte_perm_S (w[ 2], w[ 1], selector); + w[51] = __byte_perm_S (w[ 1], w[ 0], selector); + w[50] = __byte_perm_S (w[ 0], 0, selector); + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + 
w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 51: + w[63] = __byte_perm_S (w[12], w[11], selector); + w[62] = __byte_perm_S (w[11], w[10], selector); + w[61] = __byte_perm_S (w[10], w[ 9], selector); + w[60] = __byte_perm_S (w[ 9], w[ 8], selector); + w[59] = __byte_perm_S (w[ 8], w[ 7], selector); + w[58] = __byte_perm_S (w[ 7], w[ 6], selector); + w[57] = __byte_perm_S (w[ 6], w[ 5], selector); + w[56] = __byte_perm_S (w[ 5], w[ 4], selector); + w[55] = __byte_perm_S (w[ 4], w[ 3], selector); + w[54] = __byte_perm_S (w[ 3], w[ 2], selector); + w[53] = __byte_perm_S (w[ 2], w[ 1], selector); + w[52] = __byte_perm_S (w[ 1], w[ 0], selector); + w[51] = __byte_perm_S (w[ 0], 0, selector); + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 52: + w[63] = __byte_perm_S (w[11], w[10], selector); + w[62] = __byte_perm_S (w[10], w[ 9], selector); + w[61] = __byte_perm_S (w[ 9], w[ 8], selector); + w[60] = __byte_perm_S (w[ 8], w[ 7], selector); + w[59] = __byte_perm_S (w[ 7], w[ 6], selector); + w[58] = __byte_perm_S (w[ 6], w[ 5], selector); + w[57] = __byte_perm_S (w[ 5], w[ 4], selector); + w[56] = __byte_perm_S (w[ 4], w[ 3], selector); + w[55] = __byte_perm_S 
(w[ 3], w[ 2], selector); + w[54] = __byte_perm_S (w[ 2], w[ 1], selector); + w[53] = __byte_perm_S (w[ 1], w[ 0], selector); + w[52] = __byte_perm_S (w[ 0], 0, selector); + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 53: + w[63] = __byte_perm_S (w[10], w[ 9], selector); + w[62] = __byte_perm_S (w[ 9], w[ 8], selector); + w[61] = __byte_perm_S (w[ 8], w[ 7], selector); + w[60] = __byte_perm_S (w[ 7], w[ 6], selector); + w[59] = __byte_perm_S (w[ 6], w[ 5], selector); + w[58] = __byte_perm_S (w[ 5], w[ 4], selector); + w[57] = __byte_perm_S (w[ 4], w[ 3], selector); + w[56] = __byte_perm_S (w[ 3], w[ 2], selector); + w[55] = __byte_perm_S (w[ 2], w[ 1], selector); + w[54] = __byte_perm_S (w[ 1], w[ 0], selector); + w[53] = __byte_perm_S (w[ 0], 0, selector); + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 
0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 54: + w[63] = __byte_perm_S (w[ 9], w[ 8], selector); + w[62] = __byte_perm_S (w[ 8], w[ 7], selector); + w[61] = __byte_perm_S (w[ 7], w[ 6], selector); + w[60] = __byte_perm_S (w[ 6], w[ 5], selector); + w[59] = __byte_perm_S (w[ 5], w[ 4], selector); + w[58] = __byte_perm_S (w[ 4], w[ 3], selector); + w[57] = __byte_perm_S (w[ 3], w[ 2], selector); + w[56] = __byte_perm_S (w[ 2], w[ 1], selector); + w[55] = __byte_perm_S (w[ 1], w[ 0], selector); + w[54] = __byte_perm_S (w[ 0], 0, selector); + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 55: + w[63] = __byte_perm_S (w[ 8], w[ 7], selector); + w[62] = __byte_perm_S (w[ 7], w[ 6], selector); + w[61] = __byte_perm_S (w[ 6], w[ 5], selector); + w[60] = __byte_perm_S (w[ 5], w[ 4], selector); + w[59] = __byte_perm_S (w[ 4], w[ 3], selector); + w[58] = __byte_perm_S (w[ 3], w[ 2], selector); + w[57] = __byte_perm_S (w[ 2], w[ 1], selector); + w[56] = __byte_perm_S (w[ 1], w[ 0], selector); + w[55] = __byte_perm_S (w[ 0], 0, selector); + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + 
w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 56: + w[63] = __byte_perm_S (w[ 7], w[ 6], selector); + w[62] = __byte_perm_S (w[ 6], w[ 5], selector); + w[61] = __byte_perm_S (w[ 5], w[ 4], selector); + w[60] = __byte_perm_S (w[ 4], w[ 3], selector); + w[59] = __byte_perm_S (w[ 3], w[ 2], selector); + w[58] = __byte_perm_S (w[ 2], w[ 1], selector); + w[57] = __byte_perm_S (w[ 1], w[ 0], selector); + w[56] = __byte_perm_S (w[ 0], 0, selector); + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 57: + w[63] = __byte_perm_S (w[ 6], w[ 5], selector); + w[62] = __byte_perm_S (w[ 5], w[ 4], selector); + w[61] = __byte_perm_S (w[ 4], w[ 3], selector); + w[60] = __byte_perm_S (w[ 3], w[ 2], selector); + w[59] = __byte_perm_S (w[ 2], w[ 1], selector); + w[58] = __byte_perm_S (w[ 1], w[ 0], selector); + w[57] = 
__byte_perm_S (w[ 0], 0, selector); + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 58: + w[63] = __byte_perm_S (w[ 5], w[ 4], selector); + w[62] = __byte_perm_S (w[ 4], w[ 3], selector); + w[61] = __byte_perm_S (w[ 3], w[ 2], selector); + w[60] = __byte_perm_S (w[ 2], w[ 1], selector); + w[59] = __byte_perm_S (w[ 1], w[ 0], selector); + w[58] = __byte_perm_S (w[ 0], 0, selector); + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 59: + w[63] = __byte_perm_S (w[ 4], w[ 3], selector); + w[62] = __byte_perm_S (w[ 3], w[ 2], selector); + w[61] = __byte_perm_S 
(w[ 2], w[ 1], selector); + w[60] = __byte_perm_S (w[ 1], w[ 0], selector); + w[59] = __byte_perm_S (w[ 0], 0, selector); + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 60: + w[63] = __byte_perm_S (w[ 3], w[ 2], selector); + w[62] = __byte_perm_S (w[ 2], w[ 1], selector); + w[61] = __byte_perm_S (w[ 1], w[ 0], selector); + w[60] = __byte_perm_S (w[ 0], 0, selector); + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 61: + w[63] = __byte_perm_S (w[ 2], w[ 1], selector); + w[62] = __byte_perm_S (w[ 1], w[ 
0], selector); + w[61] = __byte_perm_S (w[ 0], 0, selector); + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 62: + w[63] = __byte_perm_S (w[ 1], w[ 0], selector); + w[62] = __byte_perm_S (w[ 0], 0, selector); + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + + case 63: + w[63] = __byte_perm_S (w[ 0], 0, selector); + w[62] = 0; + w[61] = 0; + w[60] = 0; + w[59] = 0; + w[58] = 0; + w[57] = 0; + w[56] = 0; + w[55] = 0; + w[54] = 0; + w[53] = 0; + w[52] = 0; + 
w[51] = 0; + w[50] = 0; + w[49] = 0; + w[48] = 0; + w[47] = 0; + w[46] = 0; + w[45] = 0; + w[44] = 0; + w[43] = 0; + w[42] = 0; + w[41] = 0; + w[40] = 0; + w[39] = 0; + w[38] = 0; + w[37] = 0; + w[36] = 0; + w[35] = 0; + w[34] = 0; + w[33] = 0; + w[32] = 0; + w[31] = 0; + w[30] = 0; + w[29] = 0; + w[28] = 0; + w[27] = 0; + w[26] = 0; + w[25] = 0; + w[24] = 0; + w[23] = 0; + w[22] = 0; + w[21] = 0; + w[20] = 0; + w[19] = 0; + w[18] = 0; + w[17] = 0; + w[16] = 0; + w[15] = 0; + w[14] = 0; + w[13] = 0; + w[12] = 0; + w[11] = 0; + w[10] = 0; + w[ 9] = 0; + w[ 8] = 0; + w[ 7] = 0; + w[ 6] = 0; + w[ 5] = 0; + w[ 4] = 0; + w[ 3] = 0; + w[ 2] = 0; + w[ 1] = 0; + w[ 0] = 0; + + break; + } + #endif +} + /** * vector functions on scalar types (for inner loop usage) */
diff --git a/tools/code_generators/GEN_AMD_switch_buffer_by_offset_1x64_be_S.pl b/tools/code_generators/GEN_AMD_switch_buffer_by_offset_1x64_be_S.pl
new file mode 100644
index 000000000..06ebdd6c6
--- /dev/null
+++ b/tools/code_generators/GEN_AMD_switch_buffer_by_offset_1x64_be_S.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# Prints the 64 "case N:" bodies for the amd_bytealign_S variant of
+# switch_buffer_by_offset_1x64_be_S(); output goes to stdout.
+
+my $WORDS = 64; # number of w[] slots addressed by the generated code
+
+for my $shift (0 .. $WORDS - 1)
+{
+  printf (" case %2d:\n", $shift);
+
+  # Destinations are emitted from w[63] down to w[0].
+  for my $dst (reverse 0 .. $WORDS - 1)
+  {
+    my $src  = $dst - $shift; # printed as the second operand
+    my $prev = $src - 1;      # printed as the first operand
+
+    if    ($prev >= 0) { printf (" w[%2d] = amd_bytealign_S (w[%2d], w[%2d], offset);\n", $dst, $prev, $src); }
+    elsif ($src  >= 0) { printf (" w[%2d] = amd_bytealign_S ( 0, w[%2d], offset);\n", $dst, $src); }
+    else               { printf (" w[%2d] = 0;\n", $dst); }
+  }
+
+  # Blank line, the break statement, blank line: closes the case.
+  print "\n", " break;\n", "\n";
+}
diff --git a/tools/code_generators/GEN_NV_switch_buffer_by_offset_1x64_be_S.pl b/tools/code_generators/GEN_NV_switch_buffer_by_offset_1x64_be_S.pl
new file mode 100644
index 000000000..ade31c6d9
--- /dev/null
+++ b/tools/code_generators/GEN_NV_switch_buffer_by_offset_1x64_be_S.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/perl
+
+use strict;
+use warnings;
+
+# Prints the 64 "case N:" bodies for the __byte_perm_S variant of
+# switch_buffer_by_offset_1x64_be_S(); output goes to stdout.
+
+my $WORDS = 64; # number of w[] slots addressed by the generated code
+
+for my $shift (0 .. $WORDS - 1)
+{
+  printf (" case %2d:\n", $shift);
+
+  # Destinations are emitted from w[63] down to w[0].
+  for my $dst (reverse 0 .. $WORDS - 1)
+  {
+    my $src  = $dst - $shift; # printed as the first operand
+    my $prev = $src - 1;      # printed as the second operand
+
+    # Both source words exist / only the first exists / neither exists.
+    if    ($prev >= 0) { printf (" w[%2d] = __byte_perm_S (w[%2d], w[%2d], selector);\n", $dst, $src, $prev); }
+    elsif ($src  >= 0) { printf (" w[%2d] = __byte_perm_S (w[%2d], 0, selector);\n", $dst, $src); }
+    else               { printf (" w[%2d] = 0;\n", $dst); }
+  }
+
+  # Blank line, the break statement, blank line: closes the case.
+  print "\n", " break;\n", "\n";
+}