/** * Author......: See docs/credits.txt * License.....: MIT */ __constant u32a c_append_helper_mini[16][4] = { { 0x000000ff, 0x00000000, 0x00000000, 0x00000000 }, { 0x0000ff00, 0x00000000, 0x00000000, 0x00000000 }, { 0x00ff0000, 0x00000000, 0x00000000, 0x00000000 }, { 0xff000000, 0x00000000, 0x00000000, 0x00000000 }, { 0x00000000, 0x000000ff, 0x00000000, 0x00000000 }, { 0x00000000, 0x0000ff00, 0x00000000, 0x00000000 }, { 0x00000000, 0x00ff0000, 0x00000000, 0x00000000 }, { 0x00000000, 0xff000000, 0x00000000, 0x00000000 }, { 0x00000000, 0x00000000, 0x000000ff, 0x00000000 }, { 0x00000000, 0x00000000, 0x0000ff00, 0x00000000 }, { 0x00000000, 0x00000000, 0x00ff0000, 0x00000000 }, { 0x00000000, 0x00000000, 0xff000000, 0x00000000 }, { 0x00000000, 0x00000000, 0x00000000, 0x000000ff }, { 0x00000000, 0x00000000, 0x00000000, 0x0000ff00 }, { 0x00000000, 0x00000000, 0x00000000, 0x00ff0000 }, { 0x00000000, 0x00000000, 0x00000000, 0xff000000 }, }; /** * pure scalar functions */ DECLSPEC int ffz (const u32 v) { #ifdef _unroll #pragma unroll #endif for (int i = 0; i < 32; i++) { if ((v >> i) & 1) continue; return i; } return -1; } DECLSPEC int hash_comp (const u32 d1[4], __global const u32 *d2) { if (d1[3] > d2[DGST_R3]) return ( 1); if (d1[3] < d2[DGST_R3]) return (-1); if (d1[2] > d2[DGST_R2]) return ( 1); if (d1[2] < d2[DGST_R2]) return (-1); if (d1[1] > d2[DGST_R1]) return ( 1); if (d1[1] < d2[DGST_R1]) return (-1); if (d1[0] > d2[DGST_R0]) return ( 1); if (d1[0] < d2[DGST_R0]) return (-1); return (0); } DECLSPEC int find_hash (const u32 digest[4], const u32 digests_cnt, __global const digest_t *digests_buf) { for (u32 l = 0, r = digests_cnt; r; r >>= 1) { const u32 m = r >> 1; const u32 c = l + m; const int cmp = hash_comp (digest, digests_buf[c].digest_buf); if (cmp > 0) { l += m + 1; r--; } if (cmp == 0) return (c); } return (-1); } DECLSPEC u32 check_bitmap (__global const u32 *bitmap, const u32 bitmap_mask, const u32 bitmap_shift, const u32 digest) { return (bitmap[(digest 
>> bitmap_shift) & bitmap_mask] & (1 << (digest & 0x1f))); } DECLSPEC u32 check (const u32 digest[4], __global const u32 *bitmap_s1_a, __global const u32 *bitmap_s1_b, __global const u32 *bitmap_s1_c, __global const u32 *bitmap_s1_d, __global const u32 *bitmap_s2_a, __global const u32 *bitmap_s2_b, __global const u32 *bitmap_s2_c, __global const u32 *bitmap_s2_d, const u32 bitmap_mask, const u32 bitmap_shift1, const u32 bitmap_shift2) { if (check_bitmap (bitmap_s1_a, bitmap_mask, bitmap_shift1, digest[0]) == 0) return (0); if (check_bitmap (bitmap_s1_b, bitmap_mask, bitmap_shift1, digest[1]) == 0) return (0); if (check_bitmap (bitmap_s1_c, bitmap_mask, bitmap_shift1, digest[2]) == 0) return (0); if (check_bitmap (bitmap_s1_d, bitmap_mask, bitmap_shift1, digest[3]) == 0) return (0); if (check_bitmap (bitmap_s2_a, bitmap_mask, bitmap_shift2, digest[0]) == 0) return (0); if (check_bitmap (bitmap_s2_b, bitmap_mask, bitmap_shift2, digest[1]) == 0) return (0); if (check_bitmap (bitmap_s2_c, bitmap_mask, bitmap_shift2, digest[2]) == 0) return (0); if (check_bitmap (bitmap_s2_d, bitmap_mask, bitmap_shift2, digest[3]) == 0) return (0); return (1); } DECLSPEC void mark_hash (__global plain_t *plains_buf, __global u32 *d_result, const u32 salt_pos, const u32 digests_cnt, const u32 digest_pos, const u32 hash_pos, const u32 gid, const u32 il_pos) { const u32 idx = atomic_inc (d_result); if (idx >= digests_cnt) { // this is kind of tricky: we *must* call atomic_inc() to know about the current value from a multi-thread perspective // this action creates a buffer overflow, so we need to fix it here atomic_dec (d_result); return; } plains_buf[idx].salt_pos = salt_pos; plains_buf[idx].digest_pos = digest_pos; // relative plains_buf[idx].hash_pos = hash_pos; // absolute plains_buf[idx].gidvid = gid; plains_buf[idx].il_pos = il_pos; } DECLSPEC int count_char (const u32 *buf, const int elems, const u32 c) { int r = 0; for (int i = 0; i < elems; i++) { const u32 v = buf[i]; if (((v >> 
0) & 0xff) == c) r++; if (((v >> 8) & 0xff) == c) r++; if (((v >> 16) & 0xff) == c) r++; if (((v >> 24) & 0xff) == c) r++; } return r; } DECLSPEC float get_entropy (const u32 *buf, const int elems) { const int length = elems * 4; float entropy = 0.0; #ifdef _unroll #pragma unroll #endif for (u32 c = 0; c < 256; c++) { const int r = count_char (buf, elems, c); if (r == 0) continue; float w = (float) r / length; entropy += -w * log2 (w); } return entropy; } DECLSPEC int is_valid_hex_8 (const u8 v) { // direct lookup table is slower thanks to CMOV if ((v >= '0') && (v <= '9')) return 1; if ((v >= 'a') && (v <= 'f')) return 1; return 0; } DECLSPEC int is_valid_hex_32 (const u32 v) { if (is_valid_hex_8 ((u8) (v >> 0)) == 0) return 0; if (is_valid_hex_8 ((u8) (v >> 8)) == 0) return 0; if (is_valid_hex_8 ((u8) (v >> 16)) == 0) return 0; if (is_valid_hex_8 ((u8) (v >> 24)) == 0) return 0; return 1; } /** * vector functions */ DECLSPEC void make_utf16be (const u32x in[4], u32x out1[4], u32x out2[4]) { #if defined IS_NV out2[3] = __byte_perm (in[3], 0, 0x3727); out2[2] = __byte_perm (in[3], 0, 0x1707); out2[1] = __byte_perm (in[2], 0, 0x3727); out2[0] = __byte_perm (in[2], 0, 0x1707); out1[3] = __byte_perm (in[1], 0, 0x3727); out1[2] = __byte_perm (in[1], 0, 0x1707); out1[1] = __byte_perm (in[0], 0, 0x3727); out1[0] = __byte_perm (in[0], 0, 0x1707); #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x03070207); out2[2] = __byte_perm (in[3], 0, 0x01070007); out2[1] = __byte_perm (in[2], 0, 0x03070207); out2[0] = __byte_perm (in[2], 0, 0x01070007); out1[3] = __byte_perm (in[1], 0, 0x03070207); out1[2] = __byte_perm (in[1], 0, 0x01070007); out1[1] = __byte_perm (in[0], 0, 0x03070207); out1[0] = __byte_perm (in[0], 0, 0x01070007); #else out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); out2[0] = ((in[2] 
<< 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); #endif } DECLSPEC void make_utf16beN (const u32x in[4], u32x out1[4], u32x out2[4]) { #if defined IS_NV out2[3] = __byte_perm (in[3], 0, 0x1707); out2[2] = __byte_perm (in[3], 0, 0x3727); out2[1] = __byte_perm (in[2], 0, 0x1707); out2[0] = __byte_perm (in[2], 0, 0x3727); out1[3] = __byte_perm (in[1], 0, 0x1707); out1[2] = __byte_perm (in[1], 0, 0x3727); out1[1] = __byte_perm (in[0], 0, 0x1707); out1[0] = __byte_perm (in[0], 0, 0x3727); #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x01070007); out2[2] = __byte_perm (in[3], 0, 0x03070207); out2[1] = __byte_perm (in[2], 0, 0x01070007); out2[0] = __byte_perm (in[2], 0, 0x03070207); out1[3] = __byte_perm (in[1], 0, 0x01070007); out1[2] = __byte_perm (in[1], 0, 0x03070207); out1[1] = __byte_perm (in[0], 0, 0x01070007); out1[0] = __byte_perm (in[0], 0, 0x03070207); #else out2[3] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); out2[2] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); out2[1] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); out2[0] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); out1[3] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); out1[2] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); out1[1] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); out1[0] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); #endif } DECLSPEC void make_utf16le (const u32x in[4], u32x out1[4], u32x out2[4]) { #if defined IS_NV out2[3] = __byte_perm (in[3], 0, 0x7372); out2[2] = __byte_perm (in[3], 0, 0x7170); out2[1] = __byte_perm (in[2], 0, 0x7372); out2[0] = __byte_perm (in[2], 0, 
0x7170); out1[3] = __byte_perm (in[1], 0, 0x7372); out1[2] = __byte_perm (in[1], 0, 0x7170); out1[1] = __byte_perm (in[0], 0, 0x7372); out1[0] = __byte_perm (in[0], 0, 0x7170); #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x07030702); out2[2] = __byte_perm (in[3], 0, 0x07010700); out2[1] = __byte_perm (in[2], 0, 0x07030702); out2[0] = __byte_perm (in[2], 0, 0x07010700); out1[3] = __byte_perm (in[1], 0, 0x07030702); out1[2] = __byte_perm (in[1], 0, 0x07010700); out1[1] = __byte_perm (in[0], 0, 0x07030702); out1[0] = __byte_perm (in[0], 0, 0x07010700); #else out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); #endif } DECLSPEC void make_utf16leN (const u32x in[4], u32x out1[4], u32x out2[4]) { #if defined IS_NV out2[3] = __byte_perm (in[3], 0, 0x7170); out2[2] = __byte_perm (in[3], 0, 0x7372); out2[1] = __byte_perm (in[2], 0, 0x7170); out2[0] = __byte_perm (in[2], 0, 0x7372); out1[3] = __byte_perm (in[1], 0, 0x7170); out1[2] = __byte_perm (in[1], 0, 0x7372); out1[1] = __byte_perm (in[0], 0, 0x7170); out1[0] = __byte_perm (in[0], 0, 0x7372); #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm (in[3], 0, 0x07010700); out2[2] = __byte_perm (in[3], 0, 0x07030702); out2[1] = __byte_perm (in[2], 0, 0x07010700); out2[0] = __byte_perm (in[2], 0, 0x07030702); out1[3] = __byte_perm (in[1], 0, 0x07010700); out1[2] = __byte_perm (in[1], 0, 0x07030702); out1[1] = __byte_perm (in[0], 0, 0x07010700); out1[0] = __byte_perm (in[0], 0, 0x07030702); #else 
out2[3] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); out2[2] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); out2[1] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); out2[0] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); out1[3] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); out1[2] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); out1[1] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); out1[0] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); #endif } DECLSPEC void undo_utf16be (const u32x in1[4], const u32x in2[4], u32x out[4]) { #if defined IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x4602); out[1] = __byte_perm (in1[2], in1[3], 0x4602); out[2] = __byte_perm (in2[0], in2[1], 0x4602); out[3] = __byte_perm (in2[2], in2[3], 0x4602); #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm (in1[0], in1[1], 0x04060002); out[1] = __byte_perm (in1[2], in1[3], 0x04060002); out[2] = __byte_perm (in2[0], in2[1], 0x04060002); out[3] = __byte_perm (in2[2], in2[3], 0x04060002); #else out[0] = ((in1[0] & 0x0000ff00) >> 8) | ((in1[0] & 0xff000000) >> 16) | ((in1[1] & 0x0000ff00) << 8) | ((in1[1] & 0xff000000) << 0); out[1] = ((in1[2] & 0x0000ff00) >> 8) | ((in1[2] & 0xff000000) >> 16) | ((in1[3] & 0x0000ff00) << 8) | ((in1[3] & 0xff000000) << 0); out[2] = ((in2[0] & 0x0000ff00) >> 8) | ((in2[0] & 0xff000000) >> 16) | ((in2[1] & 0x0000ff00) << 8) | ((in2[1] & 0xff000000) << 0); out[3] = ((in2[2] & 0x0000ff00) >> 8) | ((in2[2] & 0xff000000) >> 16) | ((in2[3] & 0x0000ff00) << 8) | ((in2[3] & 0xff000000) << 0); #endif } DECLSPEC void undo_utf16le (const u32x in1[4], const u32x in2[4], u32x out[4]) { #if defined IS_NV out[0] = __byte_perm (in1[0], in1[1], 0x6420); out[1] = __byte_perm (in1[2], in1[3], 0x6420); out[2] = __byte_perm (in2[0], in2[1], 0x6420); out[3] = __byte_perm (in2[2], in2[3], 0x6420); #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm (in1[0], 
in1[1], 0x06040200); out[1] = __byte_perm (in1[2], in1[3], 0x06040200); out[2] = __byte_perm (in2[0], in2[1], 0x06040200); out[3] = __byte_perm (in2[2], in2[3], 0x06040200); #else out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); #endif } DECLSPEC void append_helper_1x4 (u32x r[4], const u32 v, const u32 m[4]) { r[0] |= v & m[0]; r[1] |= v & m[1]; r[2] |= v & m[2]; r[3] |= v & m[3]; } DECLSPEC void append_0x80_1x4 (u32x w0[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; append_helper_1x4 (w0, 0x80808080, v); } DECLSPEC void append_0x80_2x4 (u32x w0[4], u32x w1[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); append_helper_1x4 (w1, ((offset16 == 1) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_3x4 (u32x w0[4], u32x w1[4], u32x w2[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); append_helper_1x4 (w1, ((offset16 == 1) ? 
0x80808080 : 0), v); append_helper_1x4 (w2, ((offset16 == 2) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_4x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); append_helper_1x4 (w1, ((offset16 == 1) ? 0x80808080 : 0), v); append_helper_1x4 (w2, ((offset16 == 2) ? 0x80808080 : 0), v); append_helper_1x4 (w3, ((offset16 == 3) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_8x4 (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4 (w0, ((offset16 == 0) ? 0x80808080 : 0), v); append_helper_1x4 (w1, ((offset16 == 1) ? 0x80808080 : 0), v); append_helper_1x4 (w2, ((offset16 == 2) ? 0x80808080 : 0), v); append_helper_1x4 (w3, ((offset16 == 3) ? 0x80808080 : 0), v); append_helper_1x4 (w4, ((offset16 == 4) ? 0x80808080 : 0), v); append_helper_1x4 (w5, ((offset16 == 5) ? 0x80808080 : 0), v); append_helper_1x4 (w6, ((offset16 == 6) ? 0x80808080 : 0), v); append_helper_1x4 (w7, ((offset16 == 7) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_1x16 (u32x w[16], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4 (w + 0, ((offset16 == 0) ? 0x80808080 : 0), v); append_helper_1x4 (w + 4, ((offset16 == 1) ? 0x80808080 : 0), v); append_helper_1x4 (w + 8, ((offset16 == 2) ? 
0x80808080 : 0), v);
  append_helper_1x4 (w + 12, ((offset16 == 3) ? 0x80808080 : 0), v);
}

/**
 * Moves the contents of the 16-word buffer w0|w1|w2|w3 towards higher word
 * indices, in place, by a distance derived from offset.
 *
 * offset / 4 selects one of 16 switch cases, which fixes the whole-word part
 * of the move. The sub-word part is delegated to amd_bytealign () or
 * __byte_perm (), both defined outside this function.
 * NOTE(review): presumably offset is a byte count and the net effect is a
 * byte-granular shift of the 64-byte buffer - confirm against those helpers.
 *
 * In case k, word k is built from a literal 0 and w0[0], every word below k
 * is set to 0 and every word above k is built from two adjacent lower words.
 * Data that would end up beyond w3[3] is not stored anywhere.
 *
 * There is no default case: if offset / 4 is above 15 no word is moved.
 *
 * w0, w1, w2, w3: the four 4-word groups of the buffer, modified in place.
 * offset:         distance of the move.
 */

DECLSPEC void switch_buffer_by_offset_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  // Path for AMD below GCN 3 and for generic devices: every word is passed
  // through swap32 (), the move is done with amd_bytealign (), and every word
  // is passed through swap32 () again afterwards.
  // This path hands the unmodified offset to amd_bytealign (); offset_mod_4 and
  // offset_minus_4 are not used here.
  // NOTE(review): presumably amd_bytealign () only looks at the low two bits of
  // its third argument - confirm.

  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);

  // In every case the assignments run from w3[3] downwards, so each word is
  // read before it is overwritten.

  switch (offset_switch)
  {
    case 0:
      w3[3] = amd_bytealign (w3[2], w3[3], offset);
      w3[2] = amd_bytealign (w3[1], w3[2], offset);
      w3[1] = amd_bytealign (w3[0], w3[1], offset);
      w3[0] = amd_bytealign (w2[3], w3[0], offset);
      w2[3] = amd_bytealign (w2[2], w2[3], offset);
      w2[2] = amd_bytealign (w2[1], w2[2], offset);
      w2[1] = amd_bytealign (w2[0], w2[1], offset);
      w2[0] = amd_bytealign (w1[3], w2[0], offset);
      w1[3] = amd_bytealign (w1[2], w1[3], offset);
      w1[2] = amd_bytealign (w1[1], w1[2], offset);
      w1[1] = amd_bytealign (w1[0], w1[1], offset);
      w1[0] = amd_bytealign (w0[3], w1[0], offset);
      w0[3] = amd_bytealign (w0[2], w0[3], offset);
      w0[2] = amd_bytealign (w0[1], w0[2], offset);
      w0[1] = amd_bytealign (w0[0], w0[1], offset);
      w0[0] = amd_bytealign (    0, w0[0], offset);

      break;

    case 1:
      w3[3] = amd_bytealign (w3[1], w3[2], offset);
      w3[2] = amd_bytealign (w3[0], w3[1], offset);
      w3[1] = amd_bytealign (w2[3], w3[0], offset);
      w3[0] = amd_bytealign (w2[2], w2[3], offset);
      w2[3] = amd_bytealign (w2[1], w2[2], offset);
      w2[2] = amd_bytealign (w2[0], w2[1], offset);
      w2[1] = amd_bytealign (w1[3], w2[0], offset);
      w2[0] = amd_bytealign (w1[2], w1[3], offset);
      w1[3] = amd_bytealign (w1[1], w1[2], offset);
      w1[2] = amd_bytealign (w1[0], w1[1], offset);
      w1[1] = amd_bytealign (w0[3], w1[0], offset);
      w1[0] = amd_bytealign (w0[2], w0[3], offset);
      w0[3] = amd_bytealign (w0[1], w0[2], offset);
      w0[2] = amd_bytealign (w0[0], w0[1], offset);
      w0[1] = amd_bytealign (    0, w0[0], offset);
      w0[0] = 0;

      break;

    case 2:
      w3[3] = amd_bytealign (w3[0], w3[1], offset);
      w3[2] = amd_bytealign (w2[3], w3[0], offset);
      w3[1] = amd_bytealign (w2[2], w2[3], offset);
      w3[0] = amd_bytealign (w2[1], w2[2], offset);
      w2[3] = amd_bytealign (w2[0], w2[1], offset);
      w2[2] = amd_bytealign (w1[3], w2[0], offset);
      w2[1] = amd_bytealign (w1[2], w1[3], offset);
      w2[0] = amd_bytealign (w1[1], w1[2], offset);
      w1[3] = amd_bytealign (w1[0], w1[1], offset);
      w1[2] = amd_bytealign (w0[3], w1[0], offset);
      w1[1] = amd_bytealign (w0[2], w0[3], offset);
      w1[0] = amd_bytealign (w0[1], w0[2], offset);
      w0[3] = amd_bytealign (w0[0], w0[1], offset);
      w0[2] = amd_bytealign (    0, w0[0], offset);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 3:
      w3[3] = amd_bytealign (w2[3], w3[0], offset);
      w3[2] = amd_bytealign (w2[2], w2[3], offset);
      w3[1] = amd_bytealign (w2[1], w2[2], offset);
      w3[0] = amd_bytealign (w2[0], w2[1], offset);
      w2[3] = amd_bytealign (w1[3], w2[0], offset);
      w2[2] = amd_bytealign (w1[2], w1[3], offset);
      w2[1] = amd_bytealign (w1[1], w1[2], offset);
      w2[0] = amd_bytealign (w1[0], w1[1], offset);
      w1[3] = amd_bytealign (w0[3], w1[0], offset);
      w1[2] = amd_bytealign (w0[2], w0[3], offset);
      w1[1] = amd_bytealign (w0[1], w0[2], offset);
      w1[0] = amd_bytealign (w0[0], w0[1], offset);
      w0[3] = amd_bytealign (    0, w0[0], offset);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 4:
      w3[3] = amd_bytealign (w2[2], w2[3], offset);
      w3[2] = amd_bytealign (w2[1], w2[2], offset);
      w3[1] = amd_bytealign (w2[0], w2[1], offset);
      w3[0] = amd_bytealign (w1[3], w2[0], offset);
      w2[3] = amd_bytealign (w1[2], w1[3], offset);
      w2[2] = amd_bytealign (w1[1], w1[2], offset);
      w2[1] = amd_bytealign (w1[0], w1[1], offset);
      w2[0] = amd_bytealign (w0[3], w1[0], offset);
      w1[3] = amd_bytealign (w0[2], w0[3], offset);
      w1[2] = amd_bytealign (w0[1], w0[2], offset);
      w1[1] = amd_bytealign (w0[0], w0[1], offset);
      w1[0] = amd_bytealign (    0, w0[0], offset);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 5:
      w3[3] = amd_bytealign (w2[1], w2[2], offset);
      w3[2] = amd_bytealign (w2[0], w2[1], offset);
      w3[1] = amd_bytealign (w1[3], w2[0], offset);
      w3[0] = amd_bytealign (w1[2], w1[3], offset);
      w2[3] = amd_bytealign (w1[1], w1[2], offset);
      w2[2] = amd_bytealign (w1[0], w1[1], offset);
      w2[1] = amd_bytealign (w0[3], w1[0], offset);
      w2[0] = amd_bytealign (w0[2], w0[3], offset);
      w1[3] = amd_bytealign (w0[1], w0[2], offset);
      w1[2] = amd_bytealign (w0[0], w0[1], offset);
      w1[1] = amd_bytealign (    0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 6:
      w3[3] = amd_bytealign (w2[0], w2[1], offset);
      w3[2] = amd_bytealign (w1[3], w2[0], offset);
      w3[1] = amd_bytealign (w1[2], w1[3], offset);
      w3[0] = amd_bytealign (w1[1], w1[2], offset);
      w2[3] = amd_bytealign (w1[0], w1[1], offset);
      w2[2] = amd_bytealign (w0[3], w1[0], offset);
      w2[1] = amd_bytealign (w0[2], w0[3], offset);
      w2[0] = amd_bytealign (w0[1], w0[2], offset);
      w1[3] = amd_bytealign (w0[0], w0[1], offset);
      w1[2] = amd_bytealign (    0, w0[0], offset);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 7:
      w3[3] = amd_bytealign (w1[3], w2[0], offset);
      w3[2] = amd_bytealign (w1[2], w1[3], offset);
      w3[1] = amd_bytealign (w1[1], w1[2], offset);
      w3[0] = amd_bytealign (w1[0], w1[1], offset);
      w2[3] = amd_bytealign (w0[3], w1[0], offset);
      w2[2] = amd_bytealign (w0[2], w0[3], offset);
      w2[1] = amd_bytealign (w0[1], w0[2], offset);
      w2[0] = amd_bytealign (w0[0], w0[1], offset);
      w1[3] = amd_bytealign (    0, w0[0], offset);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 8:
      w3[3] = amd_bytealign (w1[2], w1[3], offset);
      w3[2] = amd_bytealign (w1[1], w1[2], offset);
      w3[1] = amd_bytealign (w1[0], w1[1], offset);
      w3[0] = amd_bytealign (w0[3], w1[0], offset);
      w2[3] = amd_bytealign (w0[2], w0[3], offset);
      w2[2] = amd_bytealign (w0[1], w0[2], offset);
      w2[1] = amd_bytealign (w0[0], w0[1], offset);
      w2[0] = amd_bytealign (    0, w0[0], offset);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 9:
      w3[3] = amd_bytealign (w1[1], w1[2], offset);
      w3[2] = amd_bytealign (w1[0], w1[1], offset);
      w3[1] = amd_bytealign (w0[3], w1[0], offset);
      w3[0] = amd_bytealign (w0[2], w0[3], offset);
      w2[3] = amd_bytealign (w0[1], w0[2], offset);
      w2[2] = amd_bytealign (w0[0], w0[1], offset);
      w2[1] = amd_bytealign (    0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = amd_bytealign (w1[0], w1[1], offset);
      w3[2] = amd_bytealign (w0[3], w1[0], offset);
      w3[1] = amd_bytealign (w0[2], w0[3], offset);
      w3[0] = amd_bytealign (w0[1], w0[2], offset);
      w2[3] = amd_bytealign (w0[0], w0[1], offset);
      w2[2] = amd_bytealign (    0, w0[0], offset);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = amd_bytealign (w0[3], w1[0], offset);
      w3[2] = amd_bytealign (w0[2], w0[3], offset);
      w3[1] = amd_bytealign (w0[1], w0[2], offset);
      w3[0] = amd_bytealign (w0[0], w0[1], offset);
      w2[3] = amd_bytealign (    0, w0[0], offset);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = amd_bytealign (w0[2], w0[3], offset);
      w3[2] = amd_bytealign (w0[1], w0[2], offset);
      w3[1] = amd_bytealign (w0[0], w0[1], offset);
      w3[0] = amd_bytealign (    0, w0[0], offset);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = amd_bytealign (w0[1], w0[2], offset);
      w3[2] = amd_bytealign (w0[0], w0[1], offset);
      w3[1] = amd_bytealign (    0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = amd_bytealign (w0[0], w0[1], offset);
      w3[2] = amd_bytealign (    0, w0[0], offset);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = amd_bytealign (    0, w0[0], offset);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }

  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);
  #endif

  // Path for NV and for AMD GCN 3 and above: no swap32 () round trip, the move
  // is done with __byte_perm () and a selector that depends only on
  // offset_minus_4.

  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  // NV: the constant 0x76543210 shifted right by offset_minus_4 nibbles,
  // reduced to its low 16 bits.
  #if defined IS_NV
  const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff;
  #endif

  // AMD: the constant 0x0706050403020100 shifted right by offset_minus_4 bytes,
  // stored in an int.
  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> (offset_minus_4 * 8);
  #endif

  // Same case layout as in the amd_bytealign () path above.

  switch (offset_switch)
  {
    case 0:
      w3[3] = __byte_perm (w3[2], w3[3], selector);
      w3[2] = __byte_perm (w3[1], w3[2], selector);
      w3[1] = __byte_perm (w3[0], w3[1], selector);
      w3[0] = __byte_perm (w2[3], w3[0], selector);
      w2[3] = __byte_perm (w2[2], w2[3], selector);
      w2[2] = __byte_perm (w2[1], w2[2], selector);
      w2[1] = __byte_perm (w2[0], w2[1], selector);
      w2[0] = __byte_perm (w1[3], w2[0], selector);
      w1[3] = __byte_perm (w1[2], w1[3], selector);
      w1[2] = __byte_perm (w1[1], w1[2], selector);
      w1[1] = __byte_perm (w1[0], w1[1], selector);
      w1[0] = __byte_perm (w0[3], w1[0], selector);
      w0[3] = __byte_perm (w0[2], w0[3], selector);
      w0[2] = __byte_perm (w0[1], w0[2], selector);
      w0[1] = __byte_perm (w0[0], w0[1], selector);
      w0[0] = __byte_perm (    0, w0[0], selector);

      break;

    case 1:
      w3[3] = __byte_perm (w3[1], w3[2], selector);
      w3[2] = __byte_perm (w3[0], w3[1], selector);
      w3[1] = __byte_perm (w2[3], w3[0], selector);
      w3[0] = __byte_perm (w2[2], w2[3], selector);
      w2[3] = __byte_perm (w2[1], w2[2], selector);
      w2[2] = __byte_perm (w2[0], w2[1], selector);
      w2[1] = __byte_perm (w1[3], w2[0], selector);
      w2[0] = __byte_perm (w1[2], w1[3], selector);
      w1[3] = __byte_perm (w1[1], w1[2], selector);
      w1[2] = __byte_perm (w1[0], w1[1], selector);
      w1[1] = __byte_perm (w0[3], w1[0], selector);
      w1[0] = __byte_perm (w0[2], w0[3], selector);
      w0[3] = __byte_perm (w0[1], w0[2], selector);
      w0[2] = __byte_perm (w0[0], w0[1], selector);
      w0[1] = __byte_perm (    0, w0[0], selector);
      w0[0] = 0;

      break;

    case 2:
      w3[3] = __byte_perm (w3[0], w3[1], selector);
      w3[2] = __byte_perm (w2[3], w3[0], selector);
      w3[1] = __byte_perm (w2[2], w2[3], selector);
      w3[0] = __byte_perm (w2[1], w2[2], selector);
      w2[3] = __byte_perm (w2[0], w2[1], selector);
      w2[2] = __byte_perm (w1[3], w2[0], selector);
      w2[1] = __byte_perm (w1[2], w1[3], selector);
      w2[0] = __byte_perm (w1[1], w1[2], selector);
      w1[3] = __byte_perm (w1[0], w1[1], selector);
      w1[2] = __byte_perm (w0[3], w1[0], selector);
      w1[1] = __byte_perm (w0[2], w0[3], selector);
      w1[0] = __byte_perm (w0[1], w0[2], selector);
      w0[3] = __byte_perm (w0[0], w0[1], selector);
      w0[2] = __byte_perm (    0, w0[0], selector);
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 3:
      w3[3] = __byte_perm (w2[3], w3[0], selector);
      w3[2] = __byte_perm (w2[2], w2[3], selector);
      w3[1] = __byte_perm (w2[1], w2[2], selector);
      w3[0] = __byte_perm (w2[0], w2[1], selector);
      w2[3] = __byte_perm (w1[3], w2[0], selector);
      w2[2] = __byte_perm (w1[2], w1[3], selector);
      w2[1] = __byte_perm (w1[1], w1[2], selector);
      w2[0] = __byte_perm (w1[0], w1[1], selector);
      w1[3] = __byte_perm (w0[3], w1[0], selector);
      w1[2] = __byte_perm (w0[2], w0[3], selector);
      w1[1] = __byte_perm (w0[1], w0[2], selector);
      w1[0] = __byte_perm (w0[0], w0[1], selector);
      w0[3] = __byte_perm (    0, w0[0], selector);
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 4:
      w3[3] = __byte_perm (w2[2], w2[3], selector);
      w3[2] = __byte_perm (w2[1], w2[2], selector);
      w3[1] = __byte_perm (w2[0], w2[1], selector);
      w3[0] = __byte_perm (w1[3], w2[0], selector);
      w2[3] = __byte_perm (w1[2], w1[3], selector);
      w2[2] = __byte_perm (w1[1], w1[2], selector);
      w2[1] = __byte_perm (w1[0], w1[1], selector);
      w2[0] = __byte_perm (w0[3], w1[0], selector);
      w1[3] = __byte_perm (w0[2], w0[3], selector);
      w1[2] = __byte_perm (w0[1], w0[2], selector);
      w1[1] = __byte_perm (w0[0], w0[1], selector);
      w1[0] = __byte_perm (    0, w0[0], selector);
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 5:
      w3[3] = __byte_perm (w2[1], w2[2], selector);
      w3[2] = __byte_perm (w2[0], w2[1], selector);
      w3[1] = __byte_perm (w1[3], w2[0], selector);
      w3[0] = __byte_perm (w1[2], w1[3], selector);
      w2[3] = __byte_perm (w1[1], w1[2], selector);
      w2[2] = __byte_perm (w1[0], w1[1], selector);
      w2[1] = __byte_perm (w0[3], w1[0], selector);
      w2[0] = __byte_perm (w0[2], w0[3], selector);
      w1[3] = __byte_perm (w0[1], w0[2], selector);
      w1[2] = __byte_perm (w0[0], w0[1], selector);
      w1[1] = __byte_perm (    0, w0[0], selector);
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 6:
      w3[3] = __byte_perm (w2[0], w2[1], selector);
      w3[2] = __byte_perm (w1[3], w2[0], selector);
      w3[1] = __byte_perm (w1[2], w1[3], selector);
      w3[0] = __byte_perm (w1[1], w1[2], selector);
      w2[3] = __byte_perm (w1[0], w1[1], selector);
      w2[2] = __byte_perm (w0[3], w1[0], selector);
      w2[1] = __byte_perm (w0[2], w0[3], selector);
      w2[0] = __byte_perm (w0[1], w0[2], selector);
      w1[3] = __byte_perm (w0[0], w0[1], selector);
      w1[2] = __byte_perm (    0, w0[0], selector);
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 7:
      w3[3] = __byte_perm (w1[3], w2[0], selector);
      w3[2] = __byte_perm (w1[2], w1[3], selector);
      w3[1] = __byte_perm (w1[1], w1[2], selector);
      w3[0] = __byte_perm (w1[0], w1[1], selector);
      w2[3] = __byte_perm (w0[3], w1[0], selector);
      w2[2] = __byte_perm (w0[2], w0[3], selector);
      w2[1] = __byte_perm (w0[1], w0[2], selector);
      w2[0] = __byte_perm (w0[0], w0[1], selector);
      w1[3] = __byte_perm (    0, w0[0], selector);
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 8:
      w3[3] = __byte_perm (w1[2], w1[3], selector);
      w3[2] = __byte_perm (w1[1], w1[2], selector);
      w3[1] = __byte_perm (w1[0], w1[1], selector);
      w3[0] = __byte_perm (w0[3], w1[0], selector);
      w2[3] = __byte_perm (w0[2], w0[3], selector);
      w2[2] = __byte_perm (w0[1], w0[2], selector);
      w2[1] = __byte_perm (w0[0], w0[1], selector);
      w2[0] = __byte_perm (    0, w0[0], selector);
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 9:
      w3[3] = __byte_perm (w1[1], w1[2], selector);
      w3[2] = __byte_perm (w1[0], w1[1], selector);
      w3[1] = __byte_perm (w0[3], w1[0], selector);
      w3[0] = __byte_perm (w0[2], w0[3], selector);
      w2[3] = __byte_perm (w0[1], w0[2], selector);
      w2[2] = __byte_perm (w0[0], w0[1], selector);
      w2[1] = __byte_perm (    0, w0[0], selector);
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 10:
      w3[3] = __byte_perm (w1[0], w1[1], selector);
      w3[2] = __byte_perm (w0[3], w1[0], selector);
      w3[1] = __byte_perm (w0[2], w0[3], selector);
      w3[0] = __byte_perm (w0[1], w0[2], selector);
      w2[3] = __byte_perm (w0[0], w0[1], selector);
      w2[2] = __byte_perm (    0, w0[0], selector);
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 11:
      w3[3] = __byte_perm (w0[3], w1[0], selector);
      w3[2] = __byte_perm (w0[2], w0[3], selector);
      w3[1] = __byte_perm (w0[1], w0[2], selector);
      w3[0] = __byte_perm (w0[0], w0[1], selector);
      w2[3] = __byte_perm (    0, w0[0], selector);
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 12:
      w3[3] = __byte_perm (w0[2], w0[3], selector);
      w3[2] = __byte_perm (w0[1], w0[2], selector);
      w3[1] = __byte_perm (w0[0], w0[1], selector);
      w3[0] = __byte_perm (    0, w0[0], selector);
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 13:
      w3[3] = __byte_perm (w0[1], w0[2], selector);
      w3[2] = __byte_perm (w0[0], w0[1], selector);
      w3[1] = __byte_perm (    0, w0[0], selector);
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 14:
      w3[3] = __byte_perm (w0[0], w0[1], selector);
      w3[2] = __byte_perm (    0, w0[0], selector);
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;

    case 15:
      w3[3] = __byte_perm (    0, w0[0], selector);
      w3[2] = 0;
      w3[1] = 0;
      w3[0] = 0;
      w2[3] = 0;
      w2[2] = 0;
      w2[1] = 0;
      w2[0] = 0;
      w1[3] = 0;
      w1[2] = 0;
      w1[1] = 0;
      w1[0] = 0;
      w0[3] = 0;
      w0[2] = 0;
      w0[1] = 0;
      w0[0] = 0;

      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset)
{
  const int offset_mod_4 = offset & 3;

  const int offset_minus_4 = 4 - offset_mod_4;

  const int offset_switch = offset / 4;

  #if defined IS_AMD || defined IS_GENERIC
  w0[0] = swap32 (w0[0]);
  w0[1] = swap32 (w0[1]);
  w0[2] = swap32 (w0[2]);
  w0[3] = swap32 (w0[3]);
  w1[0] = swap32 (w1[0]);
  w1[1] = swap32 (w1[1]);
  w1[2] = swap32 (w1[2]);
  w1[3] = swap32 (w1[3]);
  w2[0] = swap32 (w2[0]);
  w2[1] = swap32 (w2[1]);
  w2[2] = swap32 (w2[2]);
  w2[3] = swap32 (w2[3]);
  w3[0] = swap32 (w3[0]);
  w3[1] = swap32 (w3[1]);
  w3[2] = swap32 (w3[2]);
  w3[3] = swap32 (w3[3]);

  switch (offset_switch)
  {
    case 0:
      c0[0] = amd_bytealign (w3[3],     0, offset);
      w3[3] = amd_bytealign (w3[2], w3[3], offset);
      w3[2] = amd_bytealign (w3[1], w3[2], offset);
      w3[1] = amd_bytealign (w3[0], w3[1], offset);
      w3[0] = amd_bytealign (w2[3], w3[0], offset);
w2[3] = amd_bytealign (w2[2], w2[3], offset); w2[2] = amd_bytealign (w2[1], w2[2], offset); w2[1] = amd_bytealign (w2[0], w2[1], offset); w2[0] = amd_bytealign (w1[3], w2[0], offset); w1[3] = amd_bytealign (w1[2], w1[3], offset); w1[2] = amd_bytealign (w1[1], w1[2], offset); w1[1] = amd_bytealign (w1[0], w1[1], offset); w1[0] = amd_bytealign (w0[3], w1[0], offset); w0[3] = amd_bytealign (w0[2], w0[3], offset); w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: c0[1] = amd_bytealign (w3[3], 0, offset); c0[0] = amd_bytealign (w3[2], w3[3], offset); w3[3] = amd_bytealign (w3[1], w3[2], offset); w3[2] = amd_bytealign (w3[0], w3[1], offset); w3[1] = amd_bytealign (w2[3], w3[0], offset); w3[0] = amd_bytealign (w2[2], w2[3], offset); w2[3] = amd_bytealign (w2[1], w2[2], offset); w2[2] = amd_bytealign (w2[0], w2[1], offset); w2[1] = amd_bytealign (w1[3], w2[0], offset); w2[0] = amd_bytealign (w1[2], w1[3], offset); w1[3] = amd_bytealign (w1[1], w1[2], offset); w1[2] = amd_bytealign (w1[0], w1[1], offset); w1[1] = amd_bytealign (w0[3], w1[0], offset); w1[0] = amd_bytealign (w0[2], w0[3], offset); w0[3] = amd_bytealign (w0[1], w0[2], offset); w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: c0[2] = amd_bytealign (w3[3], 0, offset); c0[1] = amd_bytealign (w3[2], w3[3], offset); c0[0] = amd_bytealign (w3[1], w3[2], offset); w3[3] = amd_bytealign (w3[0], w3[1], offset); w3[2] = amd_bytealign (w2[3], w3[0], offset); w3[1] = amd_bytealign (w2[2], w2[3], offset); w3[0] = amd_bytealign (w2[1], w2[2], offset); w2[3] = amd_bytealign (w2[0], w2[1], offset); w2[2] = amd_bytealign (w1[3], w2[0], offset); w2[1] = amd_bytealign (w1[2], w1[3], offset); w2[0] = amd_bytealign (w1[1], w1[2], offset); w1[3] = amd_bytealign (w1[0], w1[1], offset); w1[2] = amd_bytealign (w0[3], w1[0], offset); w1[1] = amd_bytealign (w0[2], w0[3], 
offset); w1[0] = amd_bytealign (w0[1], w0[2], offset); w0[3] = amd_bytealign (w0[0], w0[1], offset); w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = amd_bytealign (w3[3], 0, offset); c0[2] = amd_bytealign (w3[2], w3[3], offset); c0[1] = amd_bytealign (w3[1], w3[2], offset); c0[0] = amd_bytealign (w3[0], w3[1], offset); w3[3] = amd_bytealign (w2[3], w3[0], offset); w3[2] = amd_bytealign (w2[2], w2[3], offset); w3[1] = amd_bytealign (w2[1], w2[2], offset); w3[0] = amd_bytealign (w2[0], w2[1], offset); w2[3] = amd_bytealign (w1[3], w2[0], offset); w2[2] = amd_bytealign (w1[2], w1[3], offset); w2[1] = amd_bytealign (w1[1], w1[2], offset); w2[0] = amd_bytealign (w1[0], w1[1], offset); w1[3] = amd_bytealign (w0[3], w1[0], offset); w1[2] = amd_bytealign (w0[2], w0[3], offset); w1[1] = amd_bytealign (w0[1], w0[2], offset); w1[0] = amd_bytealign (w0[0], w0[1], offset); w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = amd_bytealign (w3[3], 0, offset); c0[3] = amd_bytealign (w3[2], w3[3], offset); c0[2] = amd_bytealign (w3[1], w3[2], offset); c0[1] = amd_bytealign (w3[0], w3[1], offset); c0[0] = amd_bytealign (w2[3], w3[0], offset); w3[3] = amd_bytealign (w2[2], w2[3], offset); w3[2] = amd_bytealign (w2[1], w2[2], offset); w3[1] = amd_bytealign (w2[0], w2[1], offset); w3[0] = amd_bytealign (w1[3], w2[0], offset); w2[3] = amd_bytealign (w1[2], w1[3], offset); w2[2] = amd_bytealign (w1[1], w1[2], offset); w2[1] = amd_bytealign (w1[0], w1[1], offset); w2[0] = amd_bytealign (w0[3], w1[0], offset); w1[3] = amd_bytealign (w0[2], w0[3], offset); w1[2] = amd_bytealign (w0[1], w0[2], offset); w1[1] = amd_bytealign (w0[0], w0[1], offset); w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = amd_bytealign (w3[3], 0, offset); c1[0] = amd_bytealign (w3[2], w3[3], offset); c0[3] = amd_bytealign (w3[1], w3[2], offset); c0[2] = amd_bytealign 
(w3[0], w3[1], offset); c0[1] = amd_bytealign (w2[3], w3[0], offset); c0[0] = amd_bytealign (w2[2], w2[3], offset); w3[3] = amd_bytealign (w2[1], w2[2], offset); w3[2] = amd_bytealign (w2[0], w2[1], offset); w3[1] = amd_bytealign (w1[3], w2[0], offset); w3[0] = amd_bytealign (w1[2], w1[3], offset); w2[3] = amd_bytealign (w1[1], w1[2], offset); w2[2] = amd_bytealign (w1[0], w1[1], offset); w2[1] = amd_bytealign (w0[3], w1[0], offset); w2[0] = amd_bytealign (w0[2], w0[3], offset); w1[3] = amd_bytealign (w0[1], w0[2], offset); w1[2] = amd_bytealign (w0[0], w0[1], offset); w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = amd_bytealign (w3[3], 0, offset); c1[1] = amd_bytealign (w3[2], w3[3], offset); c1[0] = amd_bytealign (w3[1], w3[2], offset); c0[3] = amd_bytealign (w3[0], w3[1], offset); c0[2] = amd_bytealign (w2[3], w3[0], offset); c0[1] = amd_bytealign (w2[2], w2[3], offset); c0[0] = amd_bytealign (w2[1], w2[2], offset); w3[3] = amd_bytealign (w2[0], w2[1], offset); w3[2] = amd_bytealign (w1[3], w2[0], offset); w3[1] = amd_bytealign (w1[2], w1[3], offset); w3[0] = amd_bytealign (w1[1], w1[2], offset); w2[3] = amd_bytealign (w1[0], w1[1], offset); w2[2] = amd_bytealign (w0[3], w1[0], offset); w2[1] = amd_bytealign (w0[2], w0[3], offset); w2[0] = amd_bytealign (w0[1], w0[2], offset); w1[3] = amd_bytealign (w0[0], w0[1], offset); w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = amd_bytealign (w3[3], 0, offset); c1[2] = amd_bytealign (w3[2], w3[3], offset); c1[1] = amd_bytealign (w3[1], w3[2], offset); c1[0] = amd_bytealign (w3[0], w3[1], offset); c0[3] = amd_bytealign (w2[3], w3[0], offset); c0[2] = amd_bytealign (w2[2], w2[3], offset); c0[1] = amd_bytealign (w2[1], w2[2], offset); c0[0] = amd_bytealign (w2[0], w2[1], offset); w3[3] = amd_bytealign (w1[3], w2[0], offset); w3[2] = amd_bytealign (w1[2], w1[3], 
offset); w3[1] = amd_bytealign (w1[1], w1[2], offset); w3[0] = amd_bytealign (w1[0], w1[1], offset); w2[3] = amd_bytealign (w0[3], w1[0], offset); w2[2] = amd_bytealign (w0[2], w0[3], offset); w2[1] = amd_bytealign (w0[1], w0[2], offset); w2[0] = amd_bytealign (w0[0], w0[1], offset); w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = amd_bytealign (w3[3], 0, offset); c1[3] = amd_bytealign (w3[2], w3[3], offset); c1[2] = amd_bytealign (w3[1], w3[2], offset); c1[1] = amd_bytealign (w3[0], w3[1], offset); c1[0] = amd_bytealign (w2[3], w3[0], offset); c0[3] = amd_bytealign (w2[2], w2[3], offset); c0[2] = amd_bytealign (w2[1], w2[2], offset); c0[1] = amd_bytealign (w2[0], w2[1], offset); c0[0] = amd_bytealign (w1[3], w2[0], offset); w3[3] = amd_bytealign (w1[2], w1[3], offset); w3[2] = amd_bytealign (w1[1], w1[2], offset); w3[1] = amd_bytealign (w1[0], w1[1], offset); w3[0] = amd_bytealign (w0[3], w1[0], offset); w2[3] = amd_bytealign (w0[2], w0[3], offset); w2[2] = amd_bytealign (w0[1], w0[2], offset); w2[1] = amd_bytealign (w0[0], w0[1], offset); w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = amd_bytealign (w3[3], 0, offset); c2[0] = amd_bytealign (w3[2], w3[3], offset); c1[3] = amd_bytealign (w3[1], w3[2], offset); c1[2] = amd_bytealign (w3[0], w3[1], offset); c1[1] = amd_bytealign (w2[3], w3[0], offset); c1[0] = amd_bytealign (w2[2], w2[3], offset); c0[3] = amd_bytealign (w2[1], w2[2], offset); c0[2] = amd_bytealign (w2[0], w2[1], offset); c0[1] = amd_bytealign (w1[3], w2[0], offset); c0[0] = amd_bytealign (w1[2], w1[3], offset); w3[3] = amd_bytealign (w1[1], w1[2], offset); w3[2] = amd_bytealign (w1[0], w1[1], offset); w3[1] = amd_bytealign (w0[3], w1[0], offset); w3[0] = amd_bytealign (w0[2], w0[3], offset); w2[3] = amd_bytealign (w0[1], w0[2], offset); w2[2] = 
amd_bytealign (w0[0], w0[1], offset); w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = amd_bytealign (w3[3], 0, offset); c2[1] = amd_bytealign (w3[2], w3[3], offset); c2[0] = amd_bytealign (w3[1], w3[2], offset); c1[3] = amd_bytealign (w3[0], w3[1], offset); c1[2] = amd_bytealign (w2[3], w3[0], offset); c1[1] = amd_bytealign (w2[2], w2[3], offset); c1[0] = amd_bytealign (w2[1], w2[2], offset); c0[3] = amd_bytealign (w2[0], w2[1], offset); c0[2] = amd_bytealign (w1[3], w2[0], offset); c0[1] = amd_bytealign (w1[2], w1[3], offset); c0[0] = amd_bytealign (w1[1], w1[2], offset); w3[3] = amd_bytealign (w1[0], w1[1], offset); w3[2] = amd_bytealign (w0[3], w1[0], offset); w3[1] = amd_bytealign (w0[2], w0[3], offset); w3[0] = amd_bytealign (w0[1], w0[2], offset); w2[3] = amd_bytealign (w0[0], w0[1], offset); w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = amd_bytealign (w3[3], 0, offset); c2[2] = amd_bytealign (w3[2], w3[3], offset); c2[1] = amd_bytealign (w3[1], w3[2], offset); c2[0] = amd_bytealign (w3[0], w3[1], offset); c1[3] = amd_bytealign (w2[3], w3[0], offset); c1[2] = amd_bytealign (w2[2], w2[3], offset); c1[1] = amd_bytealign (w2[1], w2[2], offset); c1[0] = amd_bytealign (w2[0], w2[1], offset); c0[3] = amd_bytealign (w1[3], w2[0], offset); c0[2] = amd_bytealign (w1[2], w1[3], offset); c0[1] = amd_bytealign (w1[1], w1[2], offset); c0[0] = amd_bytealign (w1[0], w1[1], offset); w3[3] = amd_bytealign (w0[3], w1[0], offset); w3[2] = amd_bytealign (w0[2], w0[3], offset); w3[1] = amd_bytealign (w0[1], w0[2], offset); w3[0] = amd_bytealign (w0[0], w0[1], offset); w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; 
case 12: c3[0] = amd_bytealign (w3[3], 0, offset); c2[3] = amd_bytealign (w3[2], w3[3], offset); c2[2] = amd_bytealign (w3[1], w3[2], offset); c2[1] = amd_bytealign (w3[0], w3[1], offset); c2[0] = amd_bytealign (w2[3], w3[0], offset); c1[3] = amd_bytealign (w2[2], w2[3], offset); c1[2] = amd_bytealign (w2[1], w2[2], offset); c1[1] = amd_bytealign (w2[0], w2[1], offset); c1[0] = amd_bytealign (w1[3], w2[0], offset); c0[3] = amd_bytealign (w1[2], w1[3], offset); c0[2] = amd_bytealign (w1[1], w1[2], offset); c0[1] = amd_bytealign (w1[0], w1[1], offset); c0[0] = amd_bytealign (w0[3], w1[0], offset); w3[3] = amd_bytealign (w0[2], w0[3], offset); w3[2] = amd_bytealign (w0[1], w0[2], offset); w3[1] = amd_bytealign (w0[0], w0[1], offset); w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = amd_bytealign (w3[3], 0, offset); c3[0] = amd_bytealign (w3[2], w3[3], offset); c2[3] = amd_bytealign (w3[1], w3[2], offset); c2[2] = amd_bytealign (w3[0], w3[1], offset); c2[1] = amd_bytealign (w2[3], w3[0], offset); c2[0] = amd_bytealign (w2[2], w2[3], offset); c1[3] = amd_bytealign (w2[1], w2[2], offset); c1[2] = amd_bytealign (w2[0], w2[1], offset); c1[1] = amd_bytealign (w1[3], w2[0], offset); c1[0] = amd_bytealign (w1[2], w1[3], offset); c0[3] = amd_bytealign (w1[1], w1[2], offset); c0[2] = amd_bytealign (w1[0], w1[1], offset); c0[1] = amd_bytealign (w0[3], w1[0], offset); c0[0] = amd_bytealign (w0[2], w0[3], offset); w3[3] = amd_bytealign (w0[1], w0[2], offset); w3[2] = amd_bytealign (w0[0], w0[1], offset); w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = amd_bytealign (w3[3], 0, offset); c3[1] = amd_bytealign (w3[2], w3[3], offset); c3[0] = amd_bytealign (w3[1], w3[2], offset); 
c2[3] = amd_bytealign (w3[0], w3[1], offset); c2[2] = amd_bytealign (w2[3], w3[0], offset); c2[1] = amd_bytealign (w2[2], w2[3], offset); c2[0] = amd_bytealign (w2[1], w2[2], offset); c1[3] = amd_bytealign (w2[0], w2[1], offset); c1[2] = amd_bytealign (w1[3], w2[0], offset); c1[1] = amd_bytealign (w1[2], w1[3], offset); c1[0] = amd_bytealign (w1[1], w1[2], offset); c0[3] = amd_bytealign (w1[0], w1[1], offset); c0[2] = amd_bytealign (w0[3], w1[0], offset); c0[1] = amd_bytealign (w0[2], w0[3], offset); c0[0] = amd_bytealign (w0[1], w0[2], offset); w3[3] = amd_bytealign (w0[0], w0[1], offset); w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = amd_bytealign (w3[3], 0, offset); c3[2] = amd_bytealign (w3[2], w3[3], offset); c3[1] = amd_bytealign (w3[1], w3[2], offset); c3[0] = amd_bytealign (w3[0], w3[1], offset); c2[3] = amd_bytealign (w2[3], w3[0], offset); c2[2] = amd_bytealign (w2[2], w2[3], offset); c2[1] = amd_bytealign (w2[1], w2[2], offset); c2[0] = amd_bytealign (w2[0], w2[1], offset); c1[3] = amd_bytealign (w1[3], w2[0], offset); c1[2] = amd_bytealign (w1[2], w1[3], offset); c1[1] = amd_bytealign (w1[1], w1[2], offset); c1[0] = amd_bytealign (w1[0], w1[1], offset); c0[3] = amd_bytealign (w0[3], w1[0], offset); c0[2] = amd_bytealign (w0[2], w0[3], offset); c0[1] = amd_bytealign (w0[1], w0[2], offset); c0[0] = amd_bytealign (w0[0], w0[1], offset); w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } w0[0] = swap32 (w0[0]); w0[1] = swap32 (w0[1]); w0[2] = swap32 (w0[2]); w0[3] = swap32 (w0[3]); w1[0] = swap32 (w1[0]); w1[1] = swap32 (w1[1]); w1[2] = swap32 (w1[2]); w1[3] = swap32 (w1[3]); w2[0] = swap32 (w2[0]); w2[1] = swap32 (w2[1]); 
w2[2] = swap32 (w2[2]); w2[3] = swap32 (w2[3]); w3[0] = swap32 (w3[0]); w3[1] = swap32 (w3[1]); w3[2] = swap32 (w3[2]); w3[3] = swap32 (w3[3]); c0[0] = swap32 (c0[0]); c0[1] = swap32 (c0[1]); c0[2] = swap32 (c0[2]); c0[3] = swap32 (c0[3]); c1[0] = swap32 (c1[0]); c1[1] = swap32 (c1[1]); c1[2] = swap32 (c1[2]); c1[3] = swap32 (c1[3]); c2[0] = swap32 (c2[0]); c2[1] = swap32 (c2[1]); c2[2] = swap32 (c2[2]); c2[3] = swap32 (c2[3]); c3[0] = swap32 (c3[0]); c3[1] = swap32 (c3[1]); c3[2] = swap32 (c3[2]); c3[3] = swap32 (c3[3]); #endif #ifdef IS_NV // todo switch (offset_switch) { case 0: c0[0] = amd_bytealign ( 0, w3[3], offset_minus_4); w3[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); w3[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); w3[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); w3[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); w2[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); w2[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); w2[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); w2[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); w1[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); w1[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); w1[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); w1[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); w0[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); w0[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); w0[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); w0[0] = amd_bytealign (w0[0], 0, offset_minus_4); if (offset_mod_4 == 0) { w0[0] = w0[1]; w0[1] = w0[2]; w0[2] = w0[3]; w0[3] = w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = 0; } break; case 1: c0[1] = amd_bytealign ( 0, w3[3], offset_minus_4); c0[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); w3[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); w3[2] = amd_bytealign (w3[1], w3[0], 
offset_minus_4); w3[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); w3[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); w2[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); w2[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); w2[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); w2[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); w1[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); w1[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); w1[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); w1[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); w0[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); w0[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); w0[1] = amd_bytealign (w0[0], 0, offset_minus_4); w0[0] = 0; if (offset_mod_4 == 0) { w0[1] = w0[2]; w0[2] = w0[3]; w0[3] = w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = 0; } break; case 2: c0[2] = amd_bytealign ( 0, w3[3], offset_minus_4); c0[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); c0[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); w3[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); w3[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); w3[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); w3[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); w2[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); w2[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); w2[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); w2[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); w1[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); w1[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); w1[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); w1[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); w0[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); w0[2] = amd_bytealign (w0[0], 0, offset_minus_4); w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w0[2] = w0[3]; w0[3] = 
w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = 0; } break; case 3: c0[3] = amd_bytealign ( 0, w3[3], offset_minus_4); c0[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); c0[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); c0[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); w3[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); w3[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); w3[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); w3[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); w2[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); w2[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); w2[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); w2[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); w1[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); w1[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); w1[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); w1[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); w0[3] = amd_bytealign (w0[0], 0, offset_minus_4); w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w0[3] = w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = 0; } break; case 4: c1[0] = amd_bytealign ( 0, w3[3], offset_minus_4); c0[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); c0[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); c0[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); c0[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); w3[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); w3[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); w3[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); w3[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); w2[3] = amd_bytealign (w1[3], 
w1[2], offset_minus_4); w2[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); w2[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); w2[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); w1[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); w1[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); w1[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); w1[0] = amd_bytealign (w0[0], 0, offset_minus_4); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = 0; } break; case 5: c1[1] = amd_bytealign ( 0, w3[3], offset_minus_4); c1[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); c0[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); c0[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); c0[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); c0[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); w3[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); w3[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); w3[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); w3[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); w2[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); w2[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); w2[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); w2[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); w1[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); w1[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); w1[1] = amd_bytealign (w0[0], 0, offset_minus_4); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = 0; } break; case 6: c1[2] 
= amd_bytealign ( 0, w3[3], offset_minus_4); c1[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); c1[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); c0[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); c0[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); c0[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); c0[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); w3[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); w3[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); w3[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); w3[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); w2[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); w2[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); w2[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); w2[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); w1[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); w1[2] = amd_bytealign (w0[0], 0, offset_minus_4); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = 0; } break; case 7: c1[3] = amd_bytealign ( 0, w3[3], offset_minus_4); c1[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); c1[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); c1[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); c0[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); c0[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); c0[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); c0[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); w3[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); w3[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); w3[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); w3[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); w2[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); w2[2] = amd_bytealign (w0[3], w0[2], 
offset_minus_4); w2[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); w2[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); w1[3] = amd_bytealign (w0[0], 0, offset_minus_4); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = 0; } break; case 8: c2[0] = amd_bytealign ( 0, w3[3], offset_minus_4); c1[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); c1[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); c1[1] = amd_bytealign (w3[1], w3[0], offset_minus_4); c1[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); c0[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); c0[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); c0[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); c0[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); w3[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); w3[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); w3[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); w3[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); w2[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); w2[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); w2[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); w2[0] = amd_bytealign (w0[0], 0, offset_minus_4); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = 0; } break; case 9: c2[1] = amd_bytealign ( 0, w3[3], offset_minus_4); c2[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); c1[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); 
c1[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); c1[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); c1[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); c0[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); c0[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); c0[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); c0[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); w3[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); w3[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); w3[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); w3[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); w2[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); w2[2] = amd_bytealign (w0[1], w0[0], offset_minus_4); w2[1] = amd_bytealign (w0[0], 0, offset_minus_4); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = 0; } break; case 10: c2[2] = amd_bytealign ( 0, w3[3], offset_minus_4); c2[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); c2[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); c1[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); c1[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); c1[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); c1[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); c0[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); c0[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); c0[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); c0[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); w3[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); w3[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); w3[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); w3[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); w2[3] = amd_bytealign (w0[1], w0[0], 
offset_minus_4); w2[2] = amd_bytealign (w0[0], 0, offset_minus_4); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = 0; } break; case 11: c2[3] = amd_bytealign ( 0, w3[3], offset_minus_4); c2[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); c2[1] = amd_bytealign (w3[2], w3[1], offset_minus_4); c2[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); c1[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); c1[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); c1[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); c1[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); c0[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); c0[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); c0[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); c0[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); w3[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); w3[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); w3[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); w3[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); w2[3] = amd_bytealign (w0[0], 0, offset_minus_4); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = 0; } break; case 12: c3[0] = amd_bytealign ( 0, w3[3], offset_minus_4); c2[3] = amd_bytealign (w3[3], w3[2], offset_minus_4); c2[2] = amd_bytealign (w3[2], w3[1], offset_minus_4); c2[1] = amd_bytealign (w3[1], w3[0], 
offset_minus_4); c2[0] = amd_bytealign (w3[0], w2[3], offset_minus_4); c1[3] = amd_bytealign (w2[3], w2[2], offset_minus_4); c1[2] = amd_bytealign (w2[2], w2[1], offset_minus_4); c1[1] = amd_bytealign (w2[1], w2[0], offset_minus_4); c1[0] = amd_bytealign (w2[0], w1[3], offset_minus_4); c0[3] = amd_bytealign (w1[3], w1[2], offset_minus_4); c0[2] = amd_bytealign (w1[2], w1[1], offset_minus_4); c0[1] = amd_bytealign (w1[1], w1[0], offset_minus_4); c0[0] = amd_bytealign (w1[0], w0[3], offset_minus_4); w3[3] = amd_bytealign (w0[3], w0[2], offset_minus_4); w3[2] = amd_bytealign (w0[2], w0[1], offset_minus_4); w3[1] = amd_bytealign (w0[1], w0[0], offset_minus_4); w3[0] = amd_bytealign (w0[0], 0, offset_minus_4); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = 0; } break; case 13: c3[1] = amd_bytealign ( 0, w3[3], offset_minus_4); c3[0] = amd_bytealign (w3[3], w3[2], offset_minus_4); c2[3] = amd_bytealign (w3[2], w3[1], offset_minus_4); c2[2] = amd_bytealign (w3[1], w3[0], offset_minus_4); c2[1] = amd_bytealign (w3[0], w2[3], offset_minus_4); c2[0] = amd_bytealign (w2[3], w2[2], offset_minus_4); c1[3] = amd_bytealign (w2[2], w2[1], offset_minus_4); c1[2] = amd_bytealign (w2[1], w2[0], offset_minus_4); c1[1] = amd_bytealign (w2[0], w1[3], offset_minus_4); c1[0] = amd_bytealign (w1[3], w1[2], offset_minus_4); c0[3] = amd_bytealign (w1[2], w1[1], offset_minus_4); c0[2] = amd_bytealign (w1[1], w1[0], offset_minus_4); c0[1] = amd_bytealign (w1[0], w0[3], offset_minus_4); c0[0] = amd_bytealign (w0[3], w0[2], offset_minus_4); w3[3] = amd_bytealign (w0[2], w0[1], offset_minus_4); w3[2] = amd_bytealign (w0[1], w0[0], 
offset_minus_4); w3[1] = amd_bytealign (w0[0], 0, offset_minus_4); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = c3[1]; c3[1] = 0; } break; case 14: c3[2] = amd_bytealign ( 0, w3[3], offset_minus_4); c3[1] = amd_bytealign (w3[3], w3[2], offset_minus_4); c3[0] = amd_bytealign (w3[2], w3[1], offset_minus_4); c2[3] = amd_bytealign (w3[1], w3[0], offset_minus_4); c2[2] = amd_bytealign (w3[0], w2[3], offset_minus_4); c2[1] = amd_bytealign (w2[3], w2[2], offset_minus_4); c2[0] = amd_bytealign (w2[2], w2[1], offset_minus_4); c1[3] = amd_bytealign (w2[1], w2[0], offset_minus_4); c1[2] = amd_bytealign (w2[0], w1[3], offset_minus_4); c1[1] = amd_bytealign (w1[3], w1[2], offset_minus_4); c1[0] = amd_bytealign (w1[2], w1[1], offset_minus_4); c0[3] = amd_bytealign (w1[1], w1[0], offset_minus_4); c0[2] = amd_bytealign (w1[0], w0[3], offset_minus_4); c0[1] = amd_bytealign (w0[3], w0[2], offset_minus_4); c0[0] = amd_bytealign (w0[2], w0[1], offset_minus_4); w3[3] = amd_bytealign (w0[1], w0[0], offset_minus_4); w3[2] = amd_bytealign (w0[0], 0, offset_minus_4); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = c3[1]; c3[1] = c3[2]; c3[2] = 0; } break; case 15: c3[3] = amd_bytealign ( 0, w3[3], offset_minus_4); c3[2] = amd_bytealign (w3[3], w3[2], offset_minus_4); c3[1] = amd_bytealign (w3[2], 
w3[1], offset_minus_4); c3[0] = amd_bytealign (w3[1], w3[0], offset_minus_4); c2[3] = amd_bytealign (w3[0], w2[3], offset_minus_4); c2[2] = amd_bytealign (w2[3], w2[2], offset_minus_4); c2[1] = amd_bytealign (w2[2], w2[1], offset_minus_4); c2[0] = amd_bytealign (w2[1], w2[0], offset_minus_4); c1[3] = amd_bytealign (w2[0], w1[3], offset_minus_4); c1[2] = amd_bytealign (w1[3], w1[2], offset_minus_4); c1[1] = amd_bytealign (w1[2], w1[1], offset_minus_4); c1[0] = amd_bytealign (w1[1], w1[0], offset_minus_4); c0[3] = amd_bytealign (w1[0], w0[3], offset_minus_4); c0[2] = amd_bytealign (w0[3], w0[2], offset_minus_4); c0[1] = amd_bytealign (w0[2], w0[1], offset_minus_4); c0[0] = amd_bytealign (w0[1], w0[0], offset_minus_4); w3[3] = amd_bytealign (w0[0], 0, offset_minus_4); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = c3[1]; c3[1] = c3[2]; c3[2] = c3[3]; c3[3] = 0; } break; } #endif }
/**
 * switch_buffer_by_offset_be
 *
 * Moves the contents of the 16-word buffer w0[0..3], w1[0..3], w2[0..3], w3[0..3]
 * towards higher word indices by `offset` bytes, in place.
 *
 * - `offset / 4` selects one of 16 switch cases, i.e. by how many whole words the data moves.
 * - Every surviving destination word is rebuilt from two neighbouring source words; the lowest
 *   surviving word is rebuilt from w0[0] and a literal 0; every word below it is set to 0.
 * - Words pushed beyond w3[3] are not stored anywhere (this variant has no carry output).
 * - The switch has no default case: for offset >= 64 the buffer is left unmodified.
 *
 * Exactly which variant is compiled depends on IS_AMD / AMD_GCN / IS_GENERIC / IS_NV:
 * one is built on amd_bytealign, the other on __byte_perm.
 *
 * NOTE(review): the "_be" suffix presumably refers to big-endian byte order inside each word -
 * confirm against the callers.
 */
DECLSPEC void switch_buffer_by_offset_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32 offset) { const int offset_switch = offset / 4;
/*
 * amd_bytealign variant. The full `offset` is handed to amd_bytealign unchanged.
 * NOTE(review): presumably amd_bytealign only looks at the low two bits of its third
 * argument - confirm against its definition.
 * In every case the assignments run from w3[3] down to w0[0], so each source word is read
 * before it is overwritten; do not reorder them.
 */
#if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: w3[3] = amd_bytealign (w3[2], w3[3], offset); w3[2] = amd_bytealign (w3[1], w3[2], offset); w3[1] = amd_bytealign (w3[0], w3[1], offset); w3[0] = amd_bytealign (w2[3], w3[0], offset); w2[3] = amd_bytealign (w2[2], w2[3], offset); w2[2] = amd_bytealign (w2[1], w2[2], offset); w2[1] = amd_bytealign (w2[0], w2[1], offset); w2[0] = amd_bytealign (w1[3], w2[0], offset); w1[3] = amd_bytealign (w1[2], w1[3], offset); w1[2] = amd_bytealign (w1[1], w1[2], offset); w1[1] = amd_bytealign (w1[0], w1[1], offset); w1[0] = 
amd_bytealign (w0[3], w1[0], offset); w0[3] = amd_bytealign (w0[2], w0[3], offset); w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: w3[3] = amd_bytealign (w3[1], w3[2], offset); w3[2] = amd_bytealign (w3[0], w3[1], offset); w3[1] = amd_bytealign (w2[3], w3[0], offset); w3[0] = amd_bytealign (w2[2], w2[3], offset); w2[3] = amd_bytealign (w2[1], w2[2], offset); w2[2] = amd_bytealign (w2[0], w2[1], offset); w2[1] = amd_bytealign (w1[3], w2[0], offset); w2[0] = amd_bytealign (w1[2], w1[3], offset); w1[3] = amd_bytealign (w1[1], w1[2], offset); w1[2] = amd_bytealign (w1[0], w1[1], offset); w1[1] = amd_bytealign (w0[3], w1[0], offset); w1[0] = amd_bytealign (w0[2], w0[3], offset); w0[3] = amd_bytealign (w0[1], w0[2], offset); w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: w3[3] = amd_bytealign (w3[0], w3[1], offset); w3[2] = amd_bytealign (w2[3], w3[0], offset); w3[1] = amd_bytealign (w2[2], w2[3], offset); w3[0] = amd_bytealign (w2[1], w2[2], offset); w2[3] = amd_bytealign (w2[0], w2[1], offset); w2[2] = amd_bytealign (w1[3], w2[0], offset); w2[1] = amd_bytealign (w1[2], w1[3], offset); w2[0] = amd_bytealign (w1[1], w1[2], offset); w1[3] = amd_bytealign (w1[0], w1[1], offset); w1[2] = amd_bytealign (w0[3], w1[0], offset); w1[1] = amd_bytealign (w0[2], w0[3], offset); w1[0] = amd_bytealign (w0[1], w0[2], offset); w0[3] = amd_bytealign (w0[0], w0[1], offset); w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: w3[3] = amd_bytealign (w2[3], w3[0], offset); w3[2] = amd_bytealign (w2[2], w2[3], offset); w3[1] = amd_bytealign (w2[1], w2[2], offset); w3[0] = amd_bytealign (w2[0], w2[1], offset); w2[3] = amd_bytealign (w1[3], w2[0], offset); w2[2] = amd_bytealign (w1[2], w1[3], offset); w2[1] = amd_bytealign (w1[1], w1[2], offset); w2[0] = amd_bytealign (w1[0], w1[1], offset); w1[3] 
= amd_bytealign (w0[3], w1[0], offset); w1[2] = amd_bytealign (w0[2], w0[3], offset); w1[1] = amd_bytealign (w0[1], w0[2], offset); w1[0] = amd_bytealign (w0[0], w0[1], offset); w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w3[3] = amd_bytealign (w2[2], w2[3], offset); w3[2] = amd_bytealign (w2[1], w2[2], offset); w3[1] = amd_bytealign (w2[0], w2[1], offset); w3[0] = amd_bytealign (w1[3], w2[0], offset); w2[3] = amd_bytealign (w1[2], w1[3], offset); w2[2] = amd_bytealign (w1[1], w1[2], offset); w2[1] = amd_bytealign (w1[0], w1[1], offset); w2[0] = amd_bytealign (w0[3], w1[0], offset); w1[3] = amd_bytealign (w0[2], w0[3], offset); w1[2] = amd_bytealign (w0[1], w0[2], offset); w1[1] = amd_bytealign (w0[0], w0[1], offset); w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w3[3] = amd_bytealign (w2[1], w2[2], offset); w3[2] = amd_bytealign (w2[0], w2[1], offset); w3[1] = amd_bytealign (w1[3], w2[0], offset); w3[0] = amd_bytealign (w1[2], w1[3], offset); w2[3] = amd_bytealign (w1[1], w1[2], offset); w2[2] = amd_bytealign (w1[0], w1[1], offset); w2[1] = amd_bytealign (w0[3], w1[0], offset); w2[0] = amd_bytealign (w0[2], w0[3], offset); w1[3] = amd_bytealign (w0[1], w0[2], offset); w1[2] = amd_bytealign (w0[0], w0[1], offset); w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w3[3] = amd_bytealign (w2[0], w2[1], offset); w3[2] = amd_bytealign (w1[3], w2[0], offset); w3[1] = amd_bytealign (w1[2], w1[3], offset); w3[0] = amd_bytealign (w1[1], w1[2], offset); w2[3] = amd_bytealign (w1[0], w1[1], offset); w2[2] = amd_bytealign (w0[3], w1[0], offset); w2[1] = amd_bytealign (w0[2], w0[3], offset); w2[0] = amd_bytealign (w0[1], w0[2], offset); w1[3] = amd_bytealign (w0[0], w0[1], offset); w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w3[3] = 
amd_bytealign (w1[3], w2[0], offset); w3[2] = amd_bytealign (w1[2], w1[3], offset); w3[1] = amd_bytealign (w1[1], w1[2], offset); w3[0] = amd_bytealign (w1[0], w1[1], offset); w2[3] = amd_bytealign (w0[3], w1[0], offset); w2[2] = amd_bytealign (w0[2], w0[3], offset); w2[1] = amd_bytealign (w0[1], w0[2], offset); w2[0] = amd_bytealign (w0[0], w0[1], offset); w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w3[3] = amd_bytealign (w1[2], w1[3], offset); w3[2] = amd_bytealign (w1[1], w1[2], offset); w3[1] = amd_bytealign (w1[0], w1[1], offset); w3[0] = amd_bytealign (w0[3], w1[0], offset); w2[3] = amd_bytealign (w0[2], w0[3], offset); w2[2] = amd_bytealign (w0[1], w0[2], offset); w2[1] = amd_bytealign (w0[0], w0[1], offset); w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w3[3] = amd_bytealign (w1[1], w1[2], offset); w3[2] = amd_bytealign (w1[0], w1[1], offset); w3[1] = amd_bytealign (w0[3], w1[0], offset); w3[0] = amd_bytealign (w0[2], w0[3], offset); w2[3] = amd_bytealign (w0[1], w0[2], offset); w2[2] = amd_bytealign (w0[0], w0[1], offset); w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w3[3] = amd_bytealign (w1[0], w1[1], offset); w3[2] = amd_bytealign (w0[3], w1[0], offset); w3[1] = amd_bytealign (w0[2], w0[3], offset); w3[0] = amd_bytealign (w0[1], w0[2], offset); w2[3] = amd_bytealign (w0[0], w0[1], offset); w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w3[3] = amd_bytealign (w0[3], w1[0], offset); w3[2] = amd_bytealign (w0[2], w0[3], offset); w3[1] = amd_bytealign (w0[1], w0[2], offset); w3[0] = amd_bytealign (w0[0], w0[1], offset); w2[3] = 
amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w3[3] = amd_bytealign (w0[2], w0[3], offset); w3[2] = amd_bytealign (w0[1], w0[2], offset); w3[1] = amd_bytealign (w0[0], w0[1], offset); w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w3[3] = amd_bytealign (w0[1], w0[2], offset); w3[2] = amd_bytealign (w0[0], w0[1], offset); w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w3[3] = amd_bytealign (w0[0], w0[1], offset); w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif
/*
 * __byte_perm variant. The sub-word part of the shift is encoded in `selector`, derived from
 * (offset & 3):
 *  - IS_NV:  0x76543210 shifted right by one hex digit per unit of (offset & 3), then masked
 *            to the low 16 bits (four selector digits).
 *  - IS_AMD: 0x0706050403020100 shifted right by one byte per unit of (offset & 3); the result
 *            is narrowed to int by the assignment.
 * Compared with the amd_bytealign variant the two data operands appear in swapped order.
 * As above, assignments run from w3[3] down to w0[0] so sources are read before being
 * overwritten.
 */
#if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: w3[3] = __byte_perm (w3[3], w3[2], selector); w3[2] = __byte_perm (w3[2], w3[1], selector); w3[1] = __byte_perm (w3[1], w3[0], selector); w3[0] = __byte_perm (w3[0], w2[3], selector); w2[3] = __byte_perm (w2[3], w2[2], selector); w2[2] = __byte_perm (w2[2], w2[1], selector); w2[1] = __byte_perm (w2[1], w2[0], selector); w2[0] = __byte_perm (w2[0], w1[3], selector); w1[3] = __byte_perm (w1[3], w1[2], selector); w1[2] = __byte_perm (w1[2], w1[1], 
selector); w1[1] = __byte_perm (w1[1], w1[0], selector); w1[0] = __byte_perm (w1[0], w0[3], selector); w0[3] = __byte_perm (w0[3], w0[2], selector); w0[2] = __byte_perm (w0[2], w0[1], selector); w0[1] = __byte_perm (w0[1], w0[0], selector); w0[0] = __byte_perm (w0[0], 0, selector); break; case 1: w3[3] = __byte_perm (w3[2], w3[1], selector); w3[2] = __byte_perm (w3[1], w3[0], selector); w3[1] = __byte_perm (w3[0], w2[3], selector); w3[0] = __byte_perm (w2[3], w2[2], selector); w2[3] = __byte_perm (w2[2], w2[1], selector); w2[2] = __byte_perm (w2[1], w2[0], selector); w2[1] = __byte_perm (w2[0], w1[3], selector); w2[0] = __byte_perm (w1[3], w1[2], selector); w1[3] = __byte_perm (w1[2], w1[1], selector); w1[2] = __byte_perm (w1[1], w1[0], selector); w1[1] = __byte_perm (w1[0], w0[3], selector); w1[0] = __byte_perm (w0[3], w0[2], selector); w0[3] = __byte_perm (w0[2], w0[1], selector); w0[2] = __byte_perm (w0[1], w0[0], selector); w0[1] = __byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: w3[3] = __byte_perm (w3[1], w3[0], selector); w3[2] = __byte_perm (w3[0], w2[3], selector); w3[1] = __byte_perm (w2[3], w2[2], selector); w3[0] = __byte_perm (w2[2], w2[1], selector); w2[3] = __byte_perm (w2[1], w2[0], selector); w2[2] = __byte_perm (w2[0], w1[3], selector); w2[1] = __byte_perm (w1[3], w1[2], selector); w2[0] = __byte_perm (w1[2], w1[1], selector); w1[3] = __byte_perm (w1[1], w1[0], selector); w1[2] = __byte_perm (w1[0], w0[3], selector); w1[1] = __byte_perm (w0[3], w0[2], selector); w1[0] = __byte_perm (w0[2], w0[1], selector); w0[3] = __byte_perm (w0[1], w0[0], selector); w0[2] = __byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: w3[3] = __byte_perm (w3[0], w2[3], selector); w3[2] = __byte_perm (w2[3], w2[2], selector); w3[1] = __byte_perm (w2[2], w2[1], selector); w3[0] = __byte_perm (w2[1], w2[0], selector); w2[3] = __byte_perm (w2[0], w1[3], selector); w2[2] = __byte_perm (w1[3], w1[2], selector); w2[1] = __byte_perm (w1[2], w1[1], 
selector); w2[0] = __byte_perm (w1[1], w1[0], selector); w1[3] = __byte_perm (w1[0], w0[3], selector); w1[2] = __byte_perm (w0[3], w0[2], selector); w1[1] = __byte_perm (w0[2], w0[1], selector); w1[0] = __byte_perm (w0[1], w0[0], selector); w0[3] = __byte_perm (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w3[3] = __byte_perm (w2[3], w2[2], selector); w3[2] = __byte_perm (w2[2], w2[1], selector); w3[1] = __byte_perm (w2[1], w2[0], selector); w3[0] = __byte_perm (w2[0], w1[3], selector); w2[3] = __byte_perm (w1[3], w1[2], selector); w2[2] = __byte_perm (w1[2], w1[1], selector); w2[1] = __byte_perm (w1[1], w1[0], selector); w2[0] = __byte_perm (w1[0], w0[3], selector); w1[3] = __byte_perm (w0[3], w0[2], selector); w1[2] = __byte_perm (w0[2], w0[1], selector); w1[1] = __byte_perm (w0[1], w0[0], selector); w1[0] = __byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w3[3] = __byte_perm (w2[2], w2[1], selector); w3[2] = __byte_perm (w2[1], w2[0], selector); w3[1] = __byte_perm (w2[0], w1[3], selector); w3[0] = __byte_perm (w1[3], w1[2], selector); w2[3] = __byte_perm (w1[2], w1[1], selector); w2[2] = __byte_perm (w1[1], w1[0], selector); w2[1] = __byte_perm (w1[0], w0[3], selector); w2[0] = __byte_perm (w0[3], w0[2], selector); w1[3] = __byte_perm (w0[2], w0[1], selector); w1[2] = __byte_perm (w0[1], w0[0], selector); w1[1] = __byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w3[3] = __byte_perm (w2[1], w2[0], selector); w3[2] = __byte_perm (w2[0], w1[3], selector); w3[1] = __byte_perm (w1[3], w1[2], selector); w3[0] = __byte_perm (w1[2], w1[1], selector); w2[3] = __byte_perm (w1[1], w1[0], selector); w2[2] = __byte_perm (w1[0], w0[3], selector); w2[1] = __byte_perm (w0[3], w0[2], selector); w2[0] = __byte_perm (w0[2], w0[1], selector); w1[3] = __byte_perm (w0[1], w0[0], selector); w1[2] = __byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; 
w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w3[3] = __byte_perm (w2[0], w1[3], selector); w3[2] = __byte_perm (w1[3], w1[2], selector); w3[1] = __byte_perm (w1[2], w1[1], selector); w3[0] = __byte_perm (w1[1], w1[0], selector); w2[3] = __byte_perm (w1[0], w0[3], selector); w2[2] = __byte_perm (w0[3], w0[2], selector); w2[1] = __byte_perm (w0[2], w0[1], selector); w2[0] = __byte_perm (w0[1], w0[0], selector); w1[3] = __byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w3[3] = __byte_perm (w1[3], w1[2], selector); w3[2] = __byte_perm (w1[2], w1[1], selector); w3[1] = __byte_perm (w1[1], w1[0], selector); w3[0] = __byte_perm (w1[0], w0[3], selector); w2[3] = __byte_perm (w0[3], w0[2], selector); w2[2] = __byte_perm (w0[2], w0[1], selector); w2[1] = __byte_perm (w0[1], w0[0], selector); w2[0] = __byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w3[3] = __byte_perm (w1[2], w1[1], selector); w3[2] = __byte_perm (w1[1], w1[0], selector); w3[1] = __byte_perm (w1[0], w0[3], selector); w3[0] = __byte_perm (w0[3], w0[2], selector); w2[3] = __byte_perm (w0[2], w0[1], selector); w2[2] = __byte_perm (w0[1], w0[0], selector); w2[1] = __byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w3[3] = __byte_perm (w1[1], w1[0], selector); w3[2] = __byte_perm (w1[0], w0[3], selector); w3[1] = __byte_perm (w0[3], w0[2], selector); w3[0] = __byte_perm (w0[2], w0[1], selector); w2[3] = __byte_perm (w0[1], w0[0], selector); w2[2] = __byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w3[3] = __byte_perm (w1[0], w0[3], selector); w3[2] = __byte_perm (w0[3], w0[2], selector); w3[1] = __byte_perm (w0[2], w0[1], selector); w3[0] = 
__byte_perm (w0[1], w0[0], selector); w2[3] = __byte_perm (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w3[3] = __byte_perm (w0[3], w0[2], selector); w3[2] = __byte_perm (w0[2], w0[1], selector); w3[1] = __byte_perm (w0[1], w0[0], selector); w3[0] = __byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w3[3] = __byte_perm (w0[2], w0[1], selector); w3[2] = __byte_perm (w0[1], w0[0], selector); w3[1] = __byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w3[3] = __byte_perm (w0[1], w0[0], selector); w3[2] = __byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w3[3] = __byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], const u32 offset) { const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: c0[0] = amd_bytealign (w3[3], 0, offset); w3[3] = amd_bytealign (w3[2], w3[3], offset); w3[2] = amd_bytealign (w3[1], w3[2], offset); w3[1] = amd_bytealign (w3[0], w3[1], offset); w3[0] = amd_bytealign (w2[3], w3[0], offset); w2[3] = amd_bytealign (w2[2], w2[3], offset); w2[2] = amd_bytealign (w2[1], w2[2], offset); w2[1] = amd_bytealign (w2[0], w2[1], offset); w2[0] = amd_bytealign 
(w1[3], w2[0], offset); w1[3] = amd_bytealign (w1[2], w1[3], offset); w1[2] = amd_bytealign (w1[1], w1[2], offset); w1[1] = amd_bytealign (w1[0], w1[1], offset); w1[0] = amd_bytealign (w0[3], w1[0], offset); w0[3] = amd_bytealign (w0[2], w0[3], offset); w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: c0[1] = amd_bytealign (w3[3], 0, offset); c0[0] = amd_bytealign (w3[2], w3[3], offset); w3[3] = amd_bytealign (w3[1], w3[2], offset); w3[2] = amd_bytealign (w3[0], w3[1], offset); w3[1] = amd_bytealign (w2[3], w3[0], offset); w3[0] = amd_bytealign (w2[2], w2[3], offset); w2[3] = amd_bytealign (w2[1], w2[2], offset); w2[2] = amd_bytealign (w2[0], w2[1], offset); w2[1] = amd_bytealign (w1[3], w2[0], offset); w2[0] = amd_bytealign (w1[2], w1[3], offset); w1[3] = amd_bytealign (w1[1], w1[2], offset); w1[2] = amd_bytealign (w1[0], w1[1], offset); w1[1] = amd_bytealign (w0[3], w1[0], offset); w1[0] = amd_bytealign (w0[2], w0[3], offset); w0[3] = amd_bytealign (w0[1], w0[2], offset); w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: c0[2] = amd_bytealign (w3[3], 0, offset); c0[1] = amd_bytealign (w3[2], w3[3], offset); c0[0] = amd_bytealign (w3[1], w3[2], offset); w3[3] = amd_bytealign (w3[0], w3[1], offset); w3[2] = amd_bytealign (w2[3], w3[0], offset); w3[1] = amd_bytealign (w2[2], w2[3], offset); w3[0] = amd_bytealign (w2[1], w2[2], offset); w2[3] = amd_bytealign (w2[0], w2[1], offset); w2[2] = amd_bytealign (w1[3], w2[0], offset); w2[1] = amd_bytealign (w1[2], w1[3], offset); w2[0] = amd_bytealign (w1[1], w1[2], offset); w1[3] = amd_bytealign (w1[0], w1[1], offset); w1[2] = amd_bytealign (w0[3], w1[0], offset); w1[1] = amd_bytealign (w0[2], w0[3], offset); w1[0] = amd_bytealign (w0[1], w0[2], offset); w0[3] = amd_bytealign (w0[0], w0[1], offset); w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 
0; break; case 3: c0[3] = amd_bytealign (w3[3], 0, offset); c0[2] = amd_bytealign (w3[2], w3[3], offset); c0[1] = amd_bytealign (w3[1], w3[2], offset); c0[0] = amd_bytealign (w3[0], w3[1], offset); w3[3] = amd_bytealign (w2[3], w3[0], offset); w3[2] = amd_bytealign (w2[2], w2[3], offset); w3[1] = amd_bytealign (w2[1], w2[2], offset); w3[0] = amd_bytealign (w2[0], w2[1], offset); w2[3] = amd_bytealign (w1[3], w2[0], offset); w2[2] = amd_bytealign (w1[2], w1[3], offset); w2[1] = amd_bytealign (w1[1], w1[2], offset); w2[0] = amd_bytealign (w1[0], w1[1], offset); w1[3] = amd_bytealign (w0[3], w1[0], offset); w1[2] = amd_bytealign (w0[2], w0[3], offset); w1[1] = amd_bytealign (w0[1], w0[2], offset); w1[0] = amd_bytealign (w0[0], w0[1], offset); w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = amd_bytealign (w3[3], 0, offset); c0[3] = amd_bytealign (w3[2], w3[3], offset); c0[2] = amd_bytealign (w3[1], w3[2], offset); c0[1] = amd_bytealign (w3[0], w3[1], offset); c0[0] = amd_bytealign (w2[3], w3[0], offset); w3[3] = amd_bytealign (w2[2], w2[3], offset); w3[2] = amd_bytealign (w2[1], w2[2], offset); w3[1] = amd_bytealign (w2[0], w2[1], offset); w3[0] = amd_bytealign (w1[3], w2[0], offset); w2[3] = amd_bytealign (w1[2], w1[3], offset); w2[2] = amd_bytealign (w1[1], w1[2], offset); w2[1] = amd_bytealign (w1[0], w1[1], offset); w2[0] = amd_bytealign (w0[3], w1[0], offset); w1[3] = amd_bytealign (w0[2], w0[3], offset); w1[2] = amd_bytealign (w0[1], w0[2], offset); w1[1] = amd_bytealign (w0[0], w0[1], offset); w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = amd_bytealign (w3[3], 0, offset); c1[0] = amd_bytealign (w3[2], w3[3], offset); c0[3] = amd_bytealign (w3[1], w3[2], offset); c0[2] = amd_bytealign (w3[0], w3[1], offset); c0[1] = amd_bytealign (w2[3], w3[0], offset); c0[0] = amd_bytealign (w2[2], w2[3], offset); w3[3] = amd_bytealign (w2[1], w2[2], offset); w3[2] 
= amd_bytealign (w2[0], w2[1], offset); w3[1] = amd_bytealign (w1[3], w2[0], offset); w3[0] = amd_bytealign (w1[2], w1[3], offset); w2[3] = amd_bytealign (w1[1], w1[2], offset); w2[2] = amd_bytealign (w1[0], w1[1], offset); w2[1] = amd_bytealign (w0[3], w1[0], offset); w2[0] = amd_bytealign (w0[2], w0[3], offset); w1[3] = amd_bytealign (w0[1], w0[2], offset); w1[2] = amd_bytealign (w0[0], w0[1], offset); w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = amd_bytealign (w3[3], 0, offset); c1[1] = amd_bytealign (w3[2], w3[3], offset); c1[0] = amd_bytealign (w3[1], w3[2], offset); c0[3] = amd_bytealign (w3[0], w3[1], offset); c0[2] = amd_bytealign (w2[3], w3[0], offset); c0[1] = amd_bytealign (w2[2], w2[3], offset); c0[0] = amd_bytealign (w2[1], w2[2], offset); w3[3] = amd_bytealign (w2[0], w2[1], offset); w3[2] = amd_bytealign (w1[3], w2[0], offset); w3[1] = amd_bytealign (w1[2], w1[3], offset); w3[0] = amd_bytealign (w1[1], w1[2], offset); w2[3] = amd_bytealign (w1[0], w1[1], offset); w2[2] = amd_bytealign (w0[3], w1[0], offset); w2[1] = amd_bytealign (w0[2], w0[3], offset); w2[0] = amd_bytealign (w0[1], w0[2], offset); w1[3] = amd_bytealign (w0[0], w0[1], offset); w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = amd_bytealign (w3[3], 0, offset); c1[2] = amd_bytealign (w3[2], w3[3], offset); c1[1] = amd_bytealign (w3[1], w3[2], offset); c1[0] = amd_bytealign (w3[0], w3[1], offset); c0[3] = amd_bytealign (w2[3], w3[0], offset); c0[2] = amd_bytealign (w2[2], w2[3], offset); c0[1] = amd_bytealign (w2[1], w2[2], offset); c0[0] = amd_bytealign (w2[0], w2[1], offset); w3[3] = amd_bytealign (w1[3], w2[0], offset); w3[2] = amd_bytealign (w1[2], w1[3], offset); w3[1] = amd_bytealign (w1[1], w1[2], offset); w3[0] = amd_bytealign (w1[0], w1[1], offset); w2[3] = amd_bytealign (w0[3], w1[0], offset); w2[2] = amd_bytealign 
(w0[2], w0[3], offset); w2[1] = amd_bytealign (w0[1], w0[2], offset); w2[0] = amd_bytealign (w0[0], w0[1], offset); w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = amd_bytealign (w3[3], 0, offset); c1[3] = amd_bytealign (w3[2], w3[3], offset); c1[2] = amd_bytealign (w3[1], w3[2], offset); c1[1] = amd_bytealign (w3[0], w3[1], offset); c1[0] = amd_bytealign (w2[3], w3[0], offset); c0[3] = amd_bytealign (w2[2], w2[3], offset); c0[2] = amd_bytealign (w2[1], w2[2], offset); c0[1] = amd_bytealign (w2[0], w2[1], offset); c0[0] = amd_bytealign (w1[3], w2[0], offset); w3[3] = amd_bytealign (w1[2], w1[3], offset); w3[2] = amd_bytealign (w1[1], w1[2], offset); w3[1] = amd_bytealign (w1[0], w1[1], offset); w3[0] = amd_bytealign (w0[3], w1[0], offset); w2[3] = amd_bytealign (w0[2], w0[3], offset); w2[2] = amd_bytealign (w0[1], w0[2], offset); w2[1] = amd_bytealign (w0[0], w0[1], offset); w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = amd_bytealign (w3[3], 0, offset); c2[0] = amd_bytealign (w3[2], w3[3], offset); c1[3] = amd_bytealign (w3[1], w3[2], offset); c1[2] = amd_bytealign (w3[0], w3[1], offset); c1[1] = amd_bytealign (w2[3], w3[0], offset); c1[0] = amd_bytealign (w2[2], w2[3], offset); c0[3] = amd_bytealign (w2[1], w2[2], offset); c0[2] = amd_bytealign (w2[0], w2[1], offset); c0[1] = amd_bytealign (w1[3], w2[0], offset); c0[0] = amd_bytealign (w1[2], w1[3], offset); w3[3] = amd_bytealign (w1[1], w1[2], offset); w3[2] = amd_bytealign (w1[0], w1[1], offset); w3[1] = amd_bytealign (w0[3], w1[0], offset); w3[0] = amd_bytealign (w0[2], w0[3], offset); w2[3] = amd_bytealign (w0[1], w0[2], offset); w2[2] = amd_bytealign (w0[0], w0[1], offset); w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 
0; break; case 10: c2[2] = amd_bytealign (w3[3], 0, offset); c2[1] = amd_bytealign (w3[2], w3[3], offset); c2[0] = amd_bytealign (w3[1], w3[2], offset); c1[3] = amd_bytealign (w3[0], w3[1], offset); c1[2] = amd_bytealign (w2[3], w3[0], offset); c1[1] = amd_bytealign (w2[2], w2[3], offset); c1[0] = amd_bytealign (w2[1], w2[2], offset); c0[3] = amd_bytealign (w2[0], w2[1], offset); c0[2] = amd_bytealign (w1[3], w2[0], offset); c0[1] = amd_bytealign (w1[2], w1[3], offset); c0[0] = amd_bytealign (w1[1], w1[2], offset); w3[3] = amd_bytealign (w1[0], w1[1], offset); w3[2] = amd_bytealign (w0[3], w1[0], offset); w3[1] = amd_bytealign (w0[2], w0[3], offset); w3[0] = amd_bytealign (w0[1], w0[2], offset); w2[3] = amd_bytealign (w0[0], w0[1], offset); w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = amd_bytealign (w3[3], 0, offset); c2[2] = amd_bytealign (w3[2], w3[3], offset); c2[1] = amd_bytealign (w3[1], w3[2], offset); c2[0] = amd_bytealign (w3[0], w3[1], offset); c1[3] = amd_bytealign (w2[3], w3[0], offset); c1[2] = amd_bytealign (w2[2], w2[3], offset); c1[1] = amd_bytealign (w2[1], w2[2], offset); c1[0] = amd_bytealign (w2[0], w2[1], offset); c0[3] = amd_bytealign (w1[3], w2[0], offset); c0[2] = amd_bytealign (w1[2], w1[3], offset); c0[1] = amd_bytealign (w1[1], w1[2], offset); c0[0] = amd_bytealign (w1[0], w1[1], offset); w3[3] = amd_bytealign (w0[3], w1[0], offset); w3[2] = amd_bytealign (w0[2], w0[3], offset); w3[1] = amd_bytealign (w0[1], w0[2], offset); w3[0] = amd_bytealign (w0[0], w0[1], offset); w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = amd_bytealign (w3[3], 0, offset); c2[3] = amd_bytealign (w3[2], w3[3], offset); c2[2] = amd_bytealign (w3[1], w3[2], offset); c2[1] = amd_bytealign (w3[0], 
w3[1], offset); c2[0] = amd_bytealign (w2[3], w3[0], offset); c1[3] = amd_bytealign (w2[2], w2[3], offset); c1[2] = amd_bytealign (w2[1], w2[2], offset); c1[1] = amd_bytealign (w2[0], w2[1], offset); c1[0] = amd_bytealign (w1[3], w2[0], offset); c0[3] = amd_bytealign (w1[2], w1[3], offset); c0[2] = amd_bytealign (w1[1], w1[2], offset); c0[1] = amd_bytealign (w1[0], w1[1], offset); c0[0] = amd_bytealign (w0[3], w1[0], offset); w3[3] = amd_bytealign (w0[2], w0[3], offset); w3[2] = amd_bytealign (w0[1], w0[2], offset); w3[1] = amd_bytealign (w0[0], w0[1], offset); w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = amd_bytealign (w3[3], 0, offset); c3[0] = amd_bytealign (w3[2], w3[3], offset); c2[3] = amd_bytealign (w3[1], w3[2], offset); c2[2] = amd_bytealign (w3[0], w3[1], offset); c2[1] = amd_bytealign (w2[3], w3[0], offset); c2[0] = amd_bytealign (w2[2], w2[3], offset); c1[3] = amd_bytealign (w2[1], w2[2], offset); c1[2] = amd_bytealign (w2[0], w2[1], offset); c1[1] = amd_bytealign (w1[3], w2[0], offset); c1[0] = amd_bytealign (w1[2], w1[3], offset); c0[3] = amd_bytealign (w1[1], w1[2], offset); c0[2] = amd_bytealign (w1[0], w1[1], offset); c0[1] = amd_bytealign (w0[3], w1[0], offset); c0[0] = amd_bytealign (w0[2], w0[3], offset); w3[3] = amd_bytealign (w0[1], w0[2], offset); w3[2] = amd_bytealign (w0[0], w0[1], offset); w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = amd_bytealign (w3[3], 0, offset); c3[1] = amd_bytealign (w3[2], w3[3], offset); c3[0] = amd_bytealign (w3[1], w3[2], offset); c2[3] = amd_bytealign (w3[0], w3[1], offset); c2[2] = amd_bytealign (w2[3], w3[0], offset); c2[1] = amd_bytealign (w2[2], w2[3], offset); c2[0] = amd_bytealign (w2[1], 
w2[2], offset); c1[3] = amd_bytealign (w2[0], w2[1], offset); c1[2] = amd_bytealign (w1[3], w2[0], offset); c1[1] = amd_bytealign (w1[2], w1[3], offset); c1[0] = amd_bytealign (w1[1], w1[2], offset); c0[3] = amd_bytealign (w1[0], w1[1], offset); c0[2] = amd_bytealign (w0[3], w1[0], offset); c0[1] = amd_bytealign (w0[2], w0[3], offset); c0[0] = amd_bytealign (w0[1], w0[2], offset); w3[3] = amd_bytealign (w0[0], w0[1], offset); w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = amd_bytealign (w3[3], 0, offset); c3[2] = amd_bytealign (w3[2], w3[3], offset); c3[1] = amd_bytealign (w3[1], w3[2], offset); c3[0] = amd_bytealign (w3[0], w3[1], offset); c2[3] = amd_bytealign (w2[3], w3[0], offset); c2[2] = amd_bytealign (w2[2], w2[3], offset); c2[1] = amd_bytealign (w2[1], w2[2], offset); c2[0] = amd_bytealign (w2[0], w2[1], offset); c1[3] = amd_bytealign (w1[3], w2[0], offset); c1[2] = amd_bytealign (w1[2], w1[3], offset); c1[1] = amd_bytealign (w1[1], w1[2], offset); c1[0] = amd_bytealign (w1[0], w1[1], offset); c0[3] = amd_bytealign (w0[3], w1[0], offset); c0[2] = amd_bytealign (w0[2], w0[3], offset); c0[1] = amd_bytealign (w0[1], w0[2], offset); c0[0] = amd_bytealign (w0[0], w0[1], offset); w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: c0[0] = __byte_perm ( 0, w3[3], selector); w3[3] = __byte_perm (w3[3], w3[2], selector); w3[2] = __byte_perm (w3[2], w3[1], 
selector); w3[1] = __byte_perm (w3[1], w3[0], selector); w3[0] = __byte_perm (w3[0], w2[3], selector); w2[3] = __byte_perm (w2[3], w2[2], selector); w2[2] = __byte_perm (w2[2], w2[1], selector); w2[1] = __byte_perm (w2[1], w2[0], selector); w2[0] = __byte_perm (w2[0], w1[3], selector); w1[3] = __byte_perm (w1[3], w1[2], selector); w1[2] = __byte_perm (w1[2], w1[1], selector); w1[1] = __byte_perm (w1[1], w1[0], selector); w1[0] = __byte_perm (w1[0], w0[3], selector); w0[3] = __byte_perm (w0[3], w0[2], selector); w0[2] = __byte_perm (w0[2], w0[1], selector); w0[1] = __byte_perm (w0[1], w0[0], selector); w0[0] = __byte_perm (w0[0], 0, selector); break; case 1: c0[1] = __byte_perm ( 0, w3[3], selector); c0[0] = __byte_perm (w3[3], w3[2], selector); w3[3] = __byte_perm (w3[2], w3[1], selector); w3[2] = __byte_perm (w3[1], w3[0], selector); w3[1] = __byte_perm (w3[0], w2[3], selector); w3[0] = __byte_perm (w2[3], w2[2], selector); w2[3] = __byte_perm (w2[2], w2[1], selector); w2[2] = __byte_perm (w2[1], w2[0], selector); w2[1] = __byte_perm (w2[0], w1[3], selector); w2[0] = __byte_perm (w1[3], w1[2], selector); w1[3] = __byte_perm (w1[2], w1[1], selector); w1[2] = __byte_perm (w1[1], w1[0], selector); w1[1] = __byte_perm (w1[0], w0[3], selector); w1[0] = __byte_perm (w0[3], w0[2], selector); w0[3] = __byte_perm (w0[2], w0[1], selector); w0[2] = __byte_perm (w0[1], w0[0], selector); w0[1] = __byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: c0[2] = __byte_perm ( 0, w3[3], selector); c0[1] = __byte_perm (w3[3], w3[2], selector); c0[0] = __byte_perm (w3[2], w3[1], selector); w3[3] = __byte_perm (w3[1], w3[0], selector); w3[2] = __byte_perm (w3[0], w2[3], selector); w3[1] = __byte_perm (w2[3], w2[2], selector); w3[0] = __byte_perm (w2[2], w2[1], selector); w2[3] = __byte_perm (w2[1], w2[0], selector); w2[2] = __byte_perm (w2[0], w1[3], selector); w2[1] = __byte_perm (w1[3], w1[2], selector); w2[0] = __byte_perm (w1[2], w1[1], selector); w1[3] = __byte_perm (w1[1], 
w1[0], selector); w1[2] = __byte_perm (w1[0], w0[3], selector); w1[1] = __byte_perm (w0[3], w0[2], selector); w1[0] = __byte_perm (w0[2], w0[1], selector); w0[3] = __byte_perm (w0[1], w0[0], selector); w0[2] = __byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = __byte_perm ( 0, w3[3], selector); c0[2] = __byte_perm (w3[3], w3[2], selector); c0[1] = __byte_perm (w3[2], w3[1], selector); c0[0] = __byte_perm (w3[1], w3[0], selector); w3[3] = __byte_perm (w3[0], w2[3], selector); w3[2] = __byte_perm (w2[3], w2[2], selector); w3[1] = __byte_perm (w2[2], w2[1], selector); w3[0] = __byte_perm (w2[1], w2[0], selector); w2[3] = __byte_perm (w2[0], w1[3], selector); w2[2] = __byte_perm (w1[3], w1[2], selector); w2[1] = __byte_perm (w1[2], w1[1], selector); w2[0] = __byte_perm (w1[1], w1[0], selector); w1[3] = __byte_perm (w1[0], w0[3], selector); w1[2] = __byte_perm (w0[3], w0[2], selector); w1[1] = __byte_perm (w0[2], w0[1], selector); w1[0] = __byte_perm (w0[1], w0[0], selector); w0[3] = __byte_perm (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = __byte_perm ( 0, w3[3], selector); c0[3] = __byte_perm (w3[3], w3[2], selector); c0[2] = __byte_perm (w3[2], w3[1], selector); c0[1] = __byte_perm (w3[1], w3[0], selector); c0[0] = __byte_perm (w3[0], w2[3], selector); w3[3] = __byte_perm (w2[3], w2[2], selector); w3[2] = __byte_perm (w2[2], w2[1], selector); w3[1] = __byte_perm (w2[1], w2[0], selector); w3[0] = __byte_perm (w2[0], w1[3], selector); w2[3] = __byte_perm (w1[3], w1[2], selector); w2[2] = __byte_perm (w1[2], w1[1], selector); w2[1] = __byte_perm (w1[1], w1[0], selector); w2[0] = __byte_perm (w1[0], w0[3], selector); w1[3] = __byte_perm (w0[3], w0[2], selector); w1[2] = __byte_perm (w0[2], w0[1], selector); w1[1] = __byte_perm (w0[1], w0[0], selector); w1[0] = __byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = __byte_perm ( 0, w3[3], selector); c1[0] = 
__byte_perm (w3[3], w3[2], selector); c0[3] = __byte_perm (w3[2], w3[1], selector); c0[2] = __byte_perm (w3[1], w3[0], selector); c0[1] = __byte_perm (w3[0], w2[3], selector); c0[0] = __byte_perm (w2[3], w2[2], selector); w3[3] = __byte_perm (w2[2], w2[1], selector); w3[2] = __byte_perm (w2[1], w2[0], selector); w3[1] = __byte_perm (w2[0], w1[3], selector); w3[0] = __byte_perm (w1[3], w1[2], selector); w2[3] = __byte_perm (w1[2], w1[1], selector); w2[2] = __byte_perm (w1[1], w1[0], selector); w2[1] = __byte_perm (w1[0], w0[3], selector); w2[0] = __byte_perm (w0[3], w0[2], selector); w1[3] = __byte_perm (w0[2], w0[1], selector); w1[2] = __byte_perm (w0[1], w0[0], selector); w1[1] = __byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = __byte_perm ( 0, w3[3], selector); c1[1] = __byte_perm (w3[3], w3[2], selector); c1[0] = __byte_perm (w3[2], w3[1], selector); c0[3] = __byte_perm (w3[1], w3[0], selector); c0[2] = __byte_perm (w3[0], w2[3], selector); c0[1] = __byte_perm (w2[3], w2[2], selector); c0[0] = __byte_perm (w2[2], w2[1], selector); w3[3] = __byte_perm (w2[1], w2[0], selector); w3[2] = __byte_perm (w2[0], w1[3], selector); w3[1] = __byte_perm (w1[3], w1[2], selector); w3[0] = __byte_perm (w1[2], w1[1], selector); w2[3] = __byte_perm (w1[1], w1[0], selector); w2[2] = __byte_perm (w1[0], w0[3], selector); w2[1] = __byte_perm (w0[3], w0[2], selector); w2[0] = __byte_perm (w0[2], w0[1], selector); w1[3] = __byte_perm (w0[1], w0[0], selector); w1[2] = __byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = __byte_perm ( 0, w3[3], selector); c1[2] = __byte_perm (w3[3], w3[2], selector); c1[1] = __byte_perm (w3[2], w3[1], selector); c1[0] = __byte_perm (w3[1], w3[0], selector); c0[3] = __byte_perm (w3[0], w2[3], selector); c0[2] = __byte_perm (w2[3], w2[2], selector); c0[1] = __byte_perm (w2[2], w2[1], selector); c0[0] = __byte_perm (w2[1], 
w2[0], selector); w3[3] = __byte_perm (w2[0], w1[3], selector); w3[2] = __byte_perm (w1[3], w1[2], selector); w3[1] = __byte_perm (w1[2], w1[1], selector); w3[0] = __byte_perm (w1[1], w1[0], selector); w2[3] = __byte_perm (w1[0], w0[3], selector); w2[2] = __byte_perm (w0[3], w0[2], selector); w2[1] = __byte_perm (w0[2], w0[1], selector); w2[0] = __byte_perm (w0[1], w0[0], selector); w1[3] = __byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = __byte_perm ( 0, w3[3], selector); c1[3] = __byte_perm (w3[3], w3[2], selector); c1[2] = __byte_perm (w3[2], w3[1], selector); c1[1] = __byte_perm (w3[1], w3[0], selector); c1[0] = __byte_perm (w3[0], w2[3], selector); c0[3] = __byte_perm (w2[3], w2[2], selector); c0[2] = __byte_perm (w2[2], w2[1], selector); c0[1] = __byte_perm (w2[1], w2[0], selector); c0[0] = __byte_perm (w2[0], w1[3], selector); w3[3] = __byte_perm (w1[3], w1[2], selector); w3[2] = __byte_perm (w1[2], w1[1], selector); w3[1] = __byte_perm (w1[1], w1[0], selector); w3[0] = __byte_perm (w1[0], w0[3], selector); w2[3] = __byte_perm (w0[3], w0[2], selector); w2[2] = __byte_perm (w0[2], w0[1], selector); w2[1] = __byte_perm (w0[1], w0[0], selector); w2[0] = __byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = __byte_perm ( 0, w3[3], selector); c2[0] = __byte_perm (w3[3], w3[2], selector); c1[3] = __byte_perm (w3[2], w3[1], selector); c1[2] = __byte_perm (w3[1], w3[0], selector); c1[1] = __byte_perm (w3[0], w2[3], selector); c1[0] = __byte_perm (w2[3], w2[2], selector); c0[3] = __byte_perm (w2[2], w2[1], selector); c0[2] = __byte_perm (w2[1], w2[0], selector); c0[1] = __byte_perm (w2[0], w1[3], selector); c0[0] = __byte_perm (w1[3], w1[2], selector); w3[3] = __byte_perm (w1[2], w1[1], selector); w3[2] = __byte_perm (w1[1], w1[0], selector); w3[1] = __byte_perm (w1[0], w0[3], selector); w3[0] 
= __byte_perm (w0[3], w0[2], selector); w2[3] = __byte_perm (w0[2], w0[1], selector); w2[2] = __byte_perm (w0[1], w0[0], selector); w2[1] = __byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = __byte_perm ( 0, w3[3], selector); c2[1] = __byte_perm (w3[3], w3[2], selector); c2[0] = __byte_perm (w3[2], w3[1], selector); c1[3] = __byte_perm (w3[1], w3[0], selector); c1[2] = __byte_perm (w3[0], w2[3], selector); c1[1] = __byte_perm (w2[3], w2[2], selector); c1[0] = __byte_perm (w2[2], w2[1], selector); c0[3] = __byte_perm (w2[1], w2[0], selector); c0[2] = __byte_perm (w2[0], w1[3], selector); c0[1] = __byte_perm (w1[3], w1[2], selector); c0[0] = __byte_perm (w1[2], w1[1], selector); w3[3] = __byte_perm (w1[1], w1[0], selector); w3[2] = __byte_perm (w1[0], w0[3], selector); w3[1] = __byte_perm (w0[3], w0[2], selector); w3[0] = __byte_perm (w0[2], w0[1], selector); w2[3] = __byte_perm (w0[1], w0[0], selector); w2[2] = __byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = __byte_perm ( 0, w3[3], selector); c2[2] = __byte_perm (w3[3], w3[2], selector); c2[1] = __byte_perm (w3[2], w3[1], selector); c2[0] = __byte_perm (w3[1], w3[0], selector); c1[3] = __byte_perm (w3[0], w2[3], selector); c1[2] = __byte_perm (w2[3], w2[2], selector); c1[1] = __byte_perm (w2[2], w2[1], selector); c1[0] = __byte_perm (w2[1], w2[0], selector); c0[3] = __byte_perm (w2[0], w1[3], selector); c0[2] = __byte_perm (w1[3], w1[2], selector); c0[1] = __byte_perm (w1[2], w1[1], selector); c0[0] = __byte_perm (w1[1], w1[0], selector); w3[3] = __byte_perm (w1[0], w0[3], selector); w3[2] = __byte_perm (w0[3], w0[2], selector); w3[1] = __byte_perm (w0[2], w0[1], selector); w3[0] = __byte_perm (w0[1], w0[0], selector); w2[3] = __byte_perm (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; 
w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = __byte_perm ( 0, w3[3], selector); c2[3] = __byte_perm (w3[3], w3[2], selector); c2[2] = __byte_perm (w3[2], w3[1], selector); c2[1] = __byte_perm (w3[1], w3[0], selector); c2[0] = __byte_perm (w3[0], w2[3], selector); c1[3] = __byte_perm (w2[3], w2[2], selector); c1[2] = __byte_perm (w2[2], w2[1], selector); c1[1] = __byte_perm (w2[1], w2[0], selector); c1[0] = __byte_perm (w2[0], w1[3], selector); c0[3] = __byte_perm (w1[3], w1[2], selector); c0[2] = __byte_perm (w1[2], w1[1], selector); c0[1] = __byte_perm (w1[1], w1[0], selector); c0[0] = __byte_perm (w1[0], w0[3], selector); w3[3] = __byte_perm (w0[3], w0[2], selector); w3[2] = __byte_perm (w0[2], w0[1], selector); w3[1] = __byte_perm (w0[1], w0[0], selector); w3[0] = __byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = __byte_perm ( 0, w3[3], selector); c3[0] = __byte_perm (w3[3], w3[2], selector); c2[3] = __byte_perm (w3[2], w3[1], selector); c2[2] = __byte_perm (w3[1], w3[0], selector); c2[1] = __byte_perm (w3[0], w2[3], selector); c2[0] = __byte_perm (w2[3], w2[2], selector); c1[3] = __byte_perm (w2[2], w2[1], selector); c1[2] = __byte_perm (w2[1], w2[0], selector); c1[1] = __byte_perm (w2[0], w1[3], selector); c1[0] = __byte_perm (w1[3], w1[2], selector); c0[3] = __byte_perm (w1[2], w1[1], selector); c0[2] = __byte_perm (w1[1], w1[0], selector); c0[1] = __byte_perm (w1[0], w0[3], selector); c0[0] = __byte_perm (w0[3], w0[2], selector); w3[3] = __byte_perm (w0[2], w0[1], selector); w3[2] = __byte_perm (w0[1], w0[0], selector); w3[1] = __byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = __byte_perm ( 0, w3[3], 
selector); c3[1] = __byte_perm (w3[3], w3[2], selector); c3[0] = __byte_perm (w3[2], w3[1], selector); c2[3] = __byte_perm (w3[1], w3[0], selector); c2[2] = __byte_perm (w3[0], w2[3], selector); c2[1] = __byte_perm (w2[3], w2[2], selector); c2[0] = __byte_perm (w2[2], w2[1], selector); c1[3] = __byte_perm (w2[1], w2[0], selector); c1[2] = __byte_perm (w2[0], w1[3], selector); c1[1] = __byte_perm (w1[3], w1[2], selector); c1[0] = __byte_perm (w1[2], w1[1], selector); c0[3] = __byte_perm (w1[1], w1[0], selector); c0[2] = __byte_perm (w1[0], w0[3], selector); c0[1] = __byte_perm (w0[3], w0[2], selector); c0[0] = __byte_perm (w0[2], w0[1], selector); w3[3] = __byte_perm (w0[1], w0[0], selector); w3[2] = __byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = __byte_perm ( 0, w3[3], selector); c3[2] = __byte_perm (w3[3], w3[2], selector); c3[1] = __byte_perm (w3[2], w3[1], selector); c3[0] = __byte_perm (w3[1], w3[0], selector); c2[3] = __byte_perm (w3[0], w2[3], selector); c2[2] = __byte_perm (w2[3], w2[2], selector); c2[1] = __byte_perm (w2[2], w2[1], selector); c2[0] = __byte_perm (w2[1], w2[0], selector); c1[3] = __byte_perm (w2[0], w1[3], selector); c1[2] = __byte_perm (w1[3], w1[2], selector); c1[1] = __byte_perm (w1[2], w1[1], selector); c1[0] = __byte_perm (w1[1], w1[0], selector); c0[3] = __byte_perm (w1[0], w0[3], selector); c0[2] = __byte_perm (w0[3], w0[2], selector); c0[1] = __byte_perm (w0[2], w0[1], selector); c0[0] = __byte_perm (w0[1], w0[0], selector); w3[3] = __byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_8x4_le (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x 
w6[4], u32x w7[4], const u32 offset) { const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC w0[0] = swap32 (w0[0]); w0[1] = swap32 (w0[1]); w0[2] = swap32 (w0[2]); w0[3] = swap32 (w0[3]); w1[0] = swap32 (w1[0]); w1[1] = swap32 (w1[1]); w1[2] = swap32 (w1[2]); w1[3] = swap32 (w1[3]); w2[0] = swap32 (w2[0]); w2[1] = swap32 (w2[1]); w2[2] = swap32 (w2[2]); w2[3] = swap32 (w2[3]); w3[0] = swap32 (w3[0]); w3[1] = swap32 (w3[1]); w3[2] = swap32 (w3[2]); w3[3] = swap32 (w3[3]); w4[0] = swap32 (w4[0]); w4[1] = swap32 (w4[1]); w4[2] = swap32 (w4[2]); w4[3] = swap32 (w4[3]); w5[0] = swap32 (w5[0]); w5[1] = swap32 (w5[1]); w5[2] = swap32 (w5[2]); w5[3] = swap32 (w5[3]); w6[0] = swap32 (w6[0]); w6[1] = swap32 (w6[1]); w6[2] = swap32 (w6[2]); w6[3] = swap32 (w6[3]); w7[0] = swap32 (w7[0]); w7[1] = swap32 (w7[1]); w7[2] = swap32 (w7[2]); w7[3] = swap32 (w7[3]); switch (offset_switch) { case 0: w7[3] = amd_bytealign (w7[2], w7[3], offset); w7[2] = amd_bytealign (w7[1], w7[2], offset); w7[1] = amd_bytealign (w7[0], w7[1], offset); w7[0] = amd_bytealign (w6[3], w7[0], offset); w6[3] = amd_bytealign (w6[2], w6[3], offset); w6[2] = amd_bytealign (w6[1], w6[2], offset); w6[1] = amd_bytealign (w6[0], w6[1], offset); w6[0] = amd_bytealign (w5[3], w6[0], offset); w5[3] = amd_bytealign (w5[2], w5[3], offset); w5[2] = amd_bytealign (w5[1], w5[2], offset); w5[1] = amd_bytealign (w5[0], w5[1], offset); w5[0] = amd_bytealign (w4[3], w5[0], offset); w4[3] = amd_bytealign (w4[2], w4[3], offset); w4[2] = amd_bytealign (w4[1], w4[2], offset); w4[1] = amd_bytealign (w4[0], w4[1], offset); w4[0] = amd_bytealign (w3[3], w4[0], offset); w3[3] = amd_bytealign (w3[2], w3[3], offset); w3[2] = amd_bytealign (w3[1], w3[2], offset); w3[1] = amd_bytealign (w3[0], w3[1], offset); w3[0] = amd_bytealign (w2[3], w3[0], offset); w2[3] = amd_bytealign (w2[2], w2[3], offset); w2[2] = 
amd_bytealign (w2[1], w2[2], offset); w2[1] = amd_bytealign (w2[0], w2[1], offset); w2[0] = amd_bytealign (w1[3], w2[0], offset); w1[3] = amd_bytealign (w1[2], w1[3], offset); w1[2] = amd_bytealign (w1[1], w1[2], offset); w1[1] = amd_bytealign (w1[0], w1[1], offset); w1[0] = amd_bytealign (w0[3], w1[0], offset); w0[3] = amd_bytealign (w0[2], w0[3], offset); w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: w7[3] = amd_bytealign (w7[1], w7[2], offset); w7[2] = amd_bytealign (w7[0], w7[1], offset); w7[1] = amd_bytealign (w6[3], w7[0], offset); w7[0] = amd_bytealign (w6[2], w6[3], offset); w6[3] = amd_bytealign (w6[1], w6[2], offset); w6[2] = amd_bytealign (w6[0], w6[1], offset); w6[1] = amd_bytealign (w5[3], w6[0], offset); w6[0] = amd_bytealign (w5[2], w5[3], offset); w5[3] = amd_bytealign (w5[1], w5[2], offset); w5[2] = amd_bytealign (w5[0], w5[1], offset); w5[1] = amd_bytealign (w4[3], w5[0], offset); w5[0] = amd_bytealign (w4[2], w4[3], offset); w4[3] = amd_bytealign (w4[1], w4[2], offset); w4[2] = amd_bytealign (w4[0], w4[1], offset); w4[1] = amd_bytealign (w3[3], w4[0], offset); w4[0] = amd_bytealign (w3[2], w3[3], offset); w3[3] = amd_bytealign (w3[1], w3[2], offset); w3[2] = amd_bytealign (w3[0], w3[1], offset); w3[1] = amd_bytealign (w2[3], w3[0], offset); w3[0] = amd_bytealign (w2[2], w2[3], offset); w2[3] = amd_bytealign (w2[1], w2[2], offset); w2[2] = amd_bytealign (w2[0], w2[1], offset); w2[1] = amd_bytealign (w1[3], w2[0], offset); w2[0] = amd_bytealign (w1[2], w1[3], offset); w1[3] = amd_bytealign (w1[1], w1[2], offset); w1[2] = amd_bytealign (w1[0], w1[1], offset); w1[1] = amd_bytealign (w0[3], w1[0], offset); w1[0] = amd_bytealign (w0[2], w0[3], offset); w0[3] = amd_bytealign (w0[1], w0[2], offset); w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: w7[3] = amd_bytealign (w7[0], w7[1], 
offset); w7[2] = amd_bytealign (w6[3], w7[0], offset); w7[1] = amd_bytealign (w6[2], w6[3], offset); w7[0] = amd_bytealign (w6[1], w6[2], offset); w6[3] = amd_bytealign (w6[0], w6[1], offset); w6[2] = amd_bytealign (w5[3], w6[0], offset); w6[1] = amd_bytealign (w5[2], w5[3], offset); w6[0] = amd_bytealign (w5[1], w5[2], offset); w5[3] = amd_bytealign (w5[0], w5[1], offset); w5[2] = amd_bytealign (w4[3], w5[0], offset); w5[1] = amd_bytealign (w4[2], w4[3], offset); w5[0] = amd_bytealign (w4[1], w4[2], offset); w4[3] = amd_bytealign (w4[0], w4[1], offset); w4[2] = amd_bytealign (w3[3], w4[0], offset); w4[1] = amd_bytealign (w3[2], w3[3], offset); w4[0] = amd_bytealign (w3[1], w3[2], offset); w3[3] = amd_bytealign (w3[0], w3[1], offset); w3[2] = amd_bytealign (w2[3], w3[0], offset); w3[1] = amd_bytealign (w2[2], w2[3], offset); w3[0] = amd_bytealign (w2[1], w2[2], offset); w2[3] = amd_bytealign (w2[0], w2[1], offset); w2[2] = amd_bytealign (w1[3], w2[0], offset); w2[1] = amd_bytealign (w1[2], w1[3], offset); w2[0] = amd_bytealign (w1[1], w1[2], offset); w1[3] = amd_bytealign (w1[0], w1[1], offset); w1[2] = amd_bytealign (w0[3], w1[0], offset); w1[1] = amd_bytealign (w0[2], w0[3], offset); w1[0] = amd_bytealign (w0[1], w0[2], offset); w0[3] = amd_bytealign (w0[0], w0[1], offset); w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: w7[3] = amd_bytealign (w6[3], w7[0], offset); w7[2] = amd_bytealign (w6[2], w6[3], offset); w7[1] = amd_bytealign (w6[1], w6[2], offset); w7[0] = amd_bytealign (w6[0], w6[1], offset); w6[3] = amd_bytealign (w5[3], w6[0], offset); w6[2] = amd_bytealign (w5[2], w5[3], offset); w6[1] = amd_bytealign (w5[1], w5[2], offset); w6[0] = amd_bytealign (w5[0], w5[1], offset); w5[3] = amd_bytealign (w4[3], w5[0], offset); w5[2] = amd_bytealign (w4[2], w4[3], offset); w5[1] = amd_bytealign (w4[1], w4[2], offset); w5[0] = amd_bytealign (w4[0], w4[1], offset); w4[3] = amd_bytealign (w3[3], w4[0], offset); w4[2] = amd_bytealign 
(w3[2], w3[3], offset); w4[1] = amd_bytealign (w3[1], w3[2], offset); w4[0] = amd_bytealign (w3[0], w3[1], offset); w3[3] = amd_bytealign (w2[3], w3[0], offset); w3[2] = amd_bytealign (w2[2], w2[3], offset); w3[1] = amd_bytealign (w2[1], w2[2], offset); w3[0] = amd_bytealign (w2[0], w2[1], offset); w2[3] = amd_bytealign (w1[3], w2[0], offset); w2[2] = amd_bytealign (w1[2], w1[3], offset); w2[1] = amd_bytealign (w1[1], w1[2], offset); w2[0] = amd_bytealign (w1[0], w1[1], offset); w1[3] = amd_bytealign (w0[3], w1[0], offset); w1[2] = amd_bytealign (w0[2], w0[3], offset); w1[1] = amd_bytealign (w0[1], w0[2], offset); w1[0] = amd_bytealign (w0[0], w0[1], offset); w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w7[3] = amd_bytealign (w6[2], w6[3], offset); w7[2] = amd_bytealign (w6[1], w6[2], offset); w7[1] = amd_bytealign (w6[0], w6[1], offset); w7[0] = amd_bytealign (w5[3], w6[0], offset); w6[3] = amd_bytealign (w5[2], w5[3], offset); w6[2] = amd_bytealign (w5[1], w5[2], offset); w6[1] = amd_bytealign (w5[0], w5[1], offset); w6[0] = amd_bytealign (w4[3], w5[0], offset); w5[3] = amd_bytealign (w4[2], w4[3], offset); w5[2] = amd_bytealign (w4[1], w4[2], offset); w5[1] = amd_bytealign (w4[0], w4[1], offset); w5[0] = amd_bytealign (w3[3], w4[0], offset); w4[3] = amd_bytealign (w3[2], w3[3], offset); w4[2] = amd_bytealign (w3[1], w3[2], offset); w4[1] = amd_bytealign (w3[0], w3[1], offset); w4[0] = amd_bytealign (w2[3], w3[0], offset); w3[3] = amd_bytealign (w2[2], w2[3], offset); w3[2] = amd_bytealign (w2[1], w2[2], offset); w3[1] = amd_bytealign (w2[0], w2[1], offset); w3[0] = amd_bytealign (w1[3], w2[0], offset); w2[3] = amd_bytealign (w1[2], w1[3], offset); w2[2] = amd_bytealign (w1[1], w1[2], offset); w2[1] = amd_bytealign (w1[0], w1[1], offset); w2[0] = amd_bytealign (w0[3], w1[0], offset); w1[3] = amd_bytealign (w0[2], w0[3], offset); w1[2] = amd_bytealign (w0[1], w0[2], offset); w1[1] = amd_bytealign (w0[0], w0[1], 
offset); w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = amd_bytealign (w6[1], w6[2], offset); w7[2] = amd_bytealign (w6[0], w6[1], offset); w7[1] = amd_bytealign (w5[3], w6[0], offset); w7[0] = amd_bytealign (w5[2], w5[3], offset); w6[3] = amd_bytealign (w5[1], w5[2], offset); w6[2] = amd_bytealign (w5[0], w5[1], offset); w6[1] = amd_bytealign (w4[3], w5[0], offset); w6[0] = amd_bytealign (w4[2], w4[3], offset); w5[3] = amd_bytealign (w4[1], w4[2], offset); w5[2] = amd_bytealign (w4[0], w4[1], offset); w5[1] = amd_bytealign (w3[3], w4[0], offset); w5[0] = amd_bytealign (w3[2], w3[3], offset); w4[3] = amd_bytealign (w3[1], w3[2], offset); w4[2] = amd_bytealign (w3[0], w3[1], offset); w4[1] = amd_bytealign (w2[3], w3[0], offset); w4[0] = amd_bytealign (w2[2], w2[3], offset); w3[3] = amd_bytealign (w2[1], w2[2], offset); w3[2] = amd_bytealign (w2[0], w2[1], offset); w3[1] = amd_bytealign (w1[3], w2[0], offset); w3[0] = amd_bytealign (w1[2], w1[3], offset); w2[3] = amd_bytealign (w1[1], w1[2], offset); w2[2] = amd_bytealign (w1[0], w1[1], offset); w2[1] = amd_bytealign (w0[3], w1[0], offset); w2[0] = amd_bytealign (w0[2], w0[3], offset); w1[3] = amd_bytealign (w0[1], w0[2], offset); w1[2] = amd_bytealign (w0[0], w0[1], offset); w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = amd_bytealign (w6[0], w6[1], offset); w7[2] = amd_bytealign (w5[3], w6[0], offset); w7[1] = amd_bytealign (w5[2], w5[3], offset); w7[0] = amd_bytealign (w5[1], w5[2], offset); w6[3] = amd_bytealign (w5[0], w5[1], offset); w6[2] = amd_bytealign (w4[3], w5[0], offset); w6[1] = amd_bytealign (w4[2], w4[3], offset); w6[0] = amd_bytealign (w4[1], w4[2], offset); w5[3] = amd_bytealign (w4[0], w4[1], offset); w5[2] = amd_bytealign (w3[3], w4[0], offset); w5[1] = amd_bytealign (w3[2], w3[3], offset); w5[0] = amd_bytealign (w3[1], w3[2], offset); w4[3] = amd_bytealign 
(w3[0], w3[1], offset); w4[2] = amd_bytealign (w2[3], w3[0], offset); w4[1] = amd_bytealign (w2[2], w2[3], offset); w4[0] = amd_bytealign (w2[1], w2[2], offset); w3[3] = amd_bytealign (w2[0], w2[1], offset); w3[2] = amd_bytealign (w1[3], w2[0], offset); w3[1] = amd_bytealign (w1[2], w1[3], offset); w3[0] = amd_bytealign (w1[1], w1[2], offset); w2[3] = amd_bytealign (w1[0], w1[1], offset); w2[2] = amd_bytealign (w0[3], w1[0], offset); w2[1] = amd_bytealign (w0[2], w0[3], offset); w2[0] = amd_bytealign (w0[1], w0[2], offset); w1[3] = amd_bytealign (w0[0], w0[1], offset); w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = amd_bytealign (w5[3], w6[0], offset); w7[2] = amd_bytealign (w5[2], w5[3], offset); w7[1] = amd_bytealign (w5[1], w5[2], offset); w7[0] = amd_bytealign (w5[0], w5[1], offset); w6[3] = amd_bytealign (w4[3], w5[0], offset); w6[2] = amd_bytealign (w4[2], w4[3], offset); w6[1] = amd_bytealign (w4[1], w4[2], offset); w6[0] = amd_bytealign (w4[0], w4[1], offset); w5[3] = amd_bytealign (w3[3], w4[0], offset); w5[2] = amd_bytealign (w3[2], w3[3], offset); w5[1] = amd_bytealign (w3[1], w3[2], offset); w5[0] = amd_bytealign (w3[0], w3[1], offset); w4[3] = amd_bytealign (w2[3], w3[0], offset); w4[2] = amd_bytealign (w2[2], w2[3], offset); w4[1] = amd_bytealign (w2[1], w2[2], offset); w4[0] = amd_bytealign (w2[0], w2[1], offset); w3[3] = amd_bytealign (w1[3], w2[0], offset); w3[2] = amd_bytealign (w1[2], w1[3], offset); w3[1] = amd_bytealign (w1[1], w1[2], offset); w3[0] = amd_bytealign (w1[0], w1[1], offset); w2[3] = amd_bytealign (w0[3], w1[0], offset); w2[2] = amd_bytealign (w0[2], w0[3], offset); w2[1] = amd_bytealign (w0[1], w0[2], offset); w2[0] = amd_bytealign (w0[0], w0[1], offset); w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = amd_bytealign (w5[2], w5[3], offset); w7[2] = 
amd_bytealign (w5[1], w5[2], offset); w7[1] = amd_bytealign (w5[0], w5[1], offset); w7[0] = amd_bytealign (w4[3], w5[0], offset); w6[3] = amd_bytealign (w4[2], w4[3], offset); w6[2] = amd_bytealign (w4[1], w4[2], offset); w6[1] = amd_bytealign (w4[0], w4[1], offset); w6[0] = amd_bytealign (w3[3], w4[0], offset); w5[3] = amd_bytealign (w3[2], w3[3], offset); w5[2] = amd_bytealign (w3[1], w3[2], offset); w5[1] = amd_bytealign (w3[0], w3[1], offset); w5[0] = amd_bytealign (w2[3], w3[0], offset); w4[3] = amd_bytealign (w2[2], w2[3], offset); w4[2] = amd_bytealign (w2[1], w2[2], offset); w4[1] = amd_bytealign (w2[0], w2[1], offset); w4[0] = amd_bytealign (w1[3], w2[0], offset); w3[3] = amd_bytealign (w1[2], w1[3], offset); w3[2] = amd_bytealign (w1[1], w1[2], offset); w3[1] = amd_bytealign (w1[0], w1[1], offset); w3[0] = amd_bytealign (w0[3], w1[0], offset); w2[3] = amd_bytealign (w0[2], w0[3], offset); w2[2] = amd_bytealign (w0[1], w0[2], offset); w2[1] = amd_bytealign (w0[0], w0[1], offset); w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = amd_bytealign (w5[1], w5[2], offset); w7[2] = amd_bytealign (w5[0], w5[1], offset); w7[1] = amd_bytealign (w4[3], w5[0], offset); w7[0] = amd_bytealign (w4[2], w4[3], offset); w6[3] = amd_bytealign (w4[1], w4[2], offset); w6[2] = amd_bytealign (w4[0], w4[1], offset); w6[1] = amd_bytealign (w3[3], w4[0], offset); w6[0] = amd_bytealign (w3[2], w3[3], offset); w5[3] = amd_bytealign (w3[1], w3[2], offset); w5[2] = amd_bytealign (w3[0], w3[1], offset); w5[1] = amd_bytealign (w2[3], w3[0], offset); w5[0] = amd_bytealign (w2[2], w2[3], offset); w4[3] = amd_bytealign (w2[1], w2[2], offset); w4[2] = amd_bytealign (w2[0], w2[1], offset); w4[1] = amd_bytealign (w1[3], w2[0], offset); w4[0] = amd_bytealign (w1[2], w1[3], offset); w3[3] = amd_bytealign (w1[1], w1[2], offset); w3[2] = amd_bytealign (w1[0], w1[1], offset); w3[1] = amd_bytealign 
(w0[3], w1[0], offset); w3[0] = amd_bytealign (w0[2], w0[3], offset); w2[3] = amd_bytealign (w0[1], w0[2], offset); w2[2] = amd_bytealign (w0[0], w0[1], offset); w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = amd_bytealign (w5[0], w5[1], offset); w7[2] = amd_bytealign (w4[3], w5[0], offset); w7[1] = amd_bytealign (w4[2], w4[3], offset); w7[0] = amd_bytealign (w4[1], w4[2], offset); w6[3] = amd_bytealign (w4[0], w4[1], offset); w6[2] = amd_bytealign (w3[3], w4[0], offset); w6[1] = amd_bytealign (w3[2], w3[3], offset); w6[0] = amd_bytealign (w3[1], w3[2], offset); w5[3] = amd_bytealign (w3[0], w3[1], offset); w5[2] = amd_bytealign (w2[3], w3[0], offset); w5[1] = amd_bytealign (w2[2], w2[3], offset); w5[0] = amd_bytealign (w2[1], w2[2], offset); w4[3] = amd_bytealign (w2[0], w2[1], offset); w4[2] = amd_bytealign (w1[3], w2[0], offset); w4[1] = amd_bytealign (w1[2], w1[3], offset); w4[0] = amd_bytealign (w1[1], w1[2], offset); w3[3] = amd_bytealign (w1[0], w1[1], offset); w3[2] = amd_bytealign (w0[3], w1[0], offset); w3[1] = amd_bytealign (w0[2], w0[3], offset); w3[0] = amd_bytealign (w0[1], w0[2], offset); w2[3] = amd_bytealign (w0[0], w0[1], offset); w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = amd_bytealign (w4[3], w5[0], offset); w7[2] = amd_bytealign (w4[2], w4[3], offset); w7[1] = amd_bytealign (w4[1], w4[2], offset); w7[0] = amd_bytealign (w4[0], w4[1], offset); w6[3] = amd_bytealign (w3[3], w4[0], offset); w6[2] = amd_bytealign (w3[2], w3[3], offset); w6[1] = amd_bytealign (w3[1], w3[2], offset); w6[0] = amd_bytealign (w3[0], w3[1], offset); w5[3] = amd_bytealign (w2[3], w3[0], offset); w5[2] = amd_bytealign (w2[2], w2[3], offset); w5[1] = amd_bytealign (w2[1], w2[2], offset); w5[0] = amd_bytealign (w2[0], w2[1], 
offset); w4[3] = amd_bytealign (w1[3], w2[0], offset); w4[2] = amd_bytealign (w1[2], w1[3], offset); w4[1] = amd_bytealign (w1[1], w1[2], offset); w4[0] = amd_bytealign (w1[0], w1[1], offset); w3[3] = amd_bytealign (w0[3], w1[0], offset); w3[2] = amd_bytealign (w0[2], w0[3], offset); w3[1] = amd_bytealign (w0[1], w0[2], offset); w3[0] = amd_bytealign (w0[0], w0[1], offset); w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = amd_bytealign (w4[2], w4[3], offset); w7[2] = amd_bytealign (w4[1], w4[2], offset); w7[1] = amd_bytealign (w4[0], w4[1], offset); w7[0] = amd_bytealign (w3[3], w4[0], offset); w6[3] = amd_bytealign (w3[2], w3[3], offset); w6[2] = amd_bytealign (w3[1], w3[2], offset); w6[1] = amd_bytealign (w3[0], w3[1], offset); w6[0] = amd_bytealign (w2[3], w3[0], offset); w5[3] = amd_bytealign (w2[2], w2[3], offset); w5[2] = amd_bytealign (w2[1], w2[2], offset); w5[1] = amd_bytealign (w2[0], w2[1], offset); w5[0] = amd_bytealign (w1[3], w2[0], offset); w4[3] = amd_bytealign (w1[2], w1[3], offset); w4[2] = amd_bytealign (w1[1], w1[2], offset); w4[1] = amd_bytealign (w1[0], w1[1], offset); w4[0] = amd_bytealign (w0[3], w1[0], offset); w3[3] = amd_bytealign (w0[2], w0[3], offset); w3[2] = amd_bytealign (w0[1], w0[2], offset); w3[1] = amd_bytealign (w0[0], w0[1], offset); w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = amd_bytealign (w4[1], w4[2], offset); w7[2] = amd_bytealign (w4[0], w4[1], offset); w7[1] = amd_bytealign (w3[3], w4[0], offset); w7[0] = amd_bytealign (w3[2], w3[3], offset); w6[3] = amd_bytealign (w3[1], w3[2], offset); w6[2] = amd_bytealign (w3[0], w3[1], offset); w6[1] = amd_bytealign (w2[3], w3[0], offset); w6[0] = amd_bytealign (w2[2], w2[3], offset); w5[3] = 
amd_bytealign (w2[1], w2[2], offset); w5[2] = amd_bytealign (w2[0], w2[1], offset); w5[1] = amd_bytealign (w1[3], w2[0], offset); w5[0] = amd_bytealign (w1[2], w1[3], offset); w4[3] = amd_bytealign (w1[1], w1[2], offset); w4[2] = amd_bytealign (w1[0], w1[1], offset); w4[1] = amd_bytealign (w0[3], w1[0], offset); w4[0] = amd_bytealign (w0[2], w0[3], offset); w3[3] = amd_bytealign (w0[1], w0[2], offset); w3[2] = amd_bytealign (w0[0], w0[1], offset); w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = amd_bytealign (w4[0], w4[1], offset); w7[2] = amd_bytealign (w3[3], w4[0], offset); w7[1] = amd_bytealign (w3[2], w3[3], offset); w7[0] = amd_bytealign (w3[1], w3[2], offset); w6[3] = amd_bytealign (w3[0], w3[1], offset); w6[2] = amd_bytealign (w2[3], w3[0], offset); w6[1] = amd_bytealign (w2[2], w2[3], offset); w6[0] = amd_bytealign (w2[1], w2[2], offset); w5[3] = amd_bytealign (w2[0], w2[1], offset); w5[2] = amd_bytealign (w1[3], w2[0], offset); w5[1] = amd_bytealign (w1[2], w1[3], offset); w5[0] = amd_bytealign (w1[1], w1[2], offset); w4[3] = amd_bytealign (w1[0], w1[1], offset); w4[2] = amd_bytealign (w0[3], w1[0], offset); w4[1] = amd_bytealign (w0[2], w0[3], offset); w4[0] = amd_bytealign (w0[1], w0[2], offset); w3[3] = amd_bytealign (w0[0], w0[1], offset); w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = amd_bytealign (w3[3], w4[0], offset); w7[2] = amd_bytealign (w3[2], w3[3], offset); w7[1] = amd_bytealign (w3[1], w3[2], offset); w7[0] = amd_bytealign (w3[0], w3[1], offset); w6[3] = amd_bytealign (w2[3], w3[0], offset); w6[2] = amd_bytealign (w2[2], w2[3], offset); w6[1] = amd_bytealign (w2[1], w2[2], offset); w6[0] = amd_bytealign 
(w2[0], w2[1], offset); w5[3] = amd_bytealign (w1[3], w2[0], offset); w5[2] = amd_bytealign (w1[2], w1[3], offset); w5[1] = amd_bytealign (w1[1], w1[2], offset); w5[0] = amd_bytealign (w1[0], w1[1], offset); w4[3] = amd_bytealign (w0[3], w1[0], offset); w4[2] = amd_bytealign (w0[2], w0[3], offset); w4[1] = amd_bytealign (w0[1], w0[2], offset); w4[0] = amd_bytealign (w0[0], w0[1], offset); w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: w7[3] = amd_bytealign (w3[2], w3[3], offset); w7[2] = amd_bytealign (w3[1], w3[2], offset); w7[1] = amd_bytealign (w3[0], w3[1], offset); w7[0] = amd_bytealign (w2[3], w3[0], offset); w6[3] = amd_bytealign (w2[2], w2[3], offset); w6[2] = amd_bytealign (w2[1], w2[2], offset); w6[1] = amd_bytealign (w2[0], w2[1], offset); w6[0] = amd_bytealign (w1[3], w2[0], offset); w5[3] = amd_bytealign (w1[2], w1[3], offset); w5[2] = amd_bytealign (w1[1], w1[2], offset); w5[1] = amd_bytealign (w1[0], w1[1], offset); w5[0] = amd_bytealign (w0[3], w1[0], offset); w4[3] = amd_bytealign (w0[2], w0[3], offset); w4[2] = amd_bytealign (w0[1], w0[2], offset); w4[1] = amd_bytealign (w0[0], w0[1], offset); w4[0] = amd_bytealign ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: w7[3] = amd_bytealign (w3[1], w3[2], offset); w7[2] = amd_bytealign (w3[0], w3[1], offset); w7[1] = amd_bytealign (w2[3], w3[0], offset); w7[0] = amd_bytealign (w2[2], w2[3], offset); w6[3] = amd_bytealign (w2[1], w2[2], offset); w6[2] = amd_bytealign (w2[0], w2[1], offset); w6[1] = amd_bytealign (w1[3], w2[0], offset); w6[0] = amd_bytealign (w1[2], w1[3], offset); w5[3] = amd_bytealign (w1[1], w1[2], offset); w5[2] = amd_bytealign (w1[0], w1[1], 
offset); w5[1] = amd_bytealign (w0[3], w1[0], offset); w5[0] = amd_bytealign (w0[2], w0[3], offset); w4[3] = amd_bytealign (w0[1], w0[2], offset); w4[2] = amd_bytealign (w0[0], w0[1], offset); w4[1] = amd_bytealign ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: w7[3] = amd_bytealign (w3[0], w3[1], offset); w7[2] = amd_bytealign (w2[3], w3[0], offset); w7[1] = amd_bytealign (w2[2], w2[3], offset); w7[0] = amd_bytealign (w2[1], w2[2], offset); w6[3] = amd_bytealign (w2[0], w2[1], offset); w6[2] = amd_bytealign (w1[3], w2[0], offset); w6[1] = amd_bytealign (w1[2], w1[3], offset); w6[0] = amd_bytealign (w1[1], w1[2], offset); w5[3] = amd_bytealign (w1[0], w1[1], offset); w5[2] = amd_bytealign (w0[3], w1[0], offset); w5[1] = amd_bytealign (w0[2], w0[3], offset); w5[0] = amd_bytealign (w0[1], w0[2], offset); w4[3] = amd_bytealign (w0[0], w0[1], offset); w4[2] = amd_bytealign ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: w7[3] = amd_bytealign (w2[3], w3[0], offset); w7[2] = amd_bytealign (w2[2], w2[3], offset); w7[1] = amd_bytealign (w2[1], w2[2], offset); w7[0] = amd_bytealign (w2[0], w2[1], offset); w6[3] = amd_bytealign (w1[3], w2[0], offset); w6[2] = amd_bytealign (w1[2], w1[3], offset); w6[1] = amd_bytealign (w1[1], w1[2], offset); w6[0] = amd_bytealign (w1[0], w1[1], offset); w5[3] = amd_bytealign (w0[3], w1[0], offset); w5[2] = amd_bytealign (w0[2], w0[3], offset); w5[1] = amd_bytealign (w0[1], w0[2], offset); w5[0] = amd_bytealign (w0[0], w0[1], offset); w4[3] = amd_bytealign ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; 
w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: w7[3] = amd_bytealign (w2[2], w2[3], offset); w7[2] = amd_bytealign (w2[1], w2[2], offset); w7[1] = amd_bytealign (w2[0], w2[1], offset); w7[0] = amd_bytealign (w1[3], w2[0], offset); w6[3] = amd_bytealign (w1[2], w1[3], offset); w6[2] = amd_bytealign (w1[1], w1[2], offset); w6[1] = amd_bytealign (w1[0], w1[1], offset); w6[0] = amd_bytealign (w0[3], w1[0], offset); w5[3] = amd_bytealign (w0[2], w0[3], offset); w5[2] = amd_bytealign (w0[1], w0[2], offset); w5[1] = amd_bytealign (w0[0], w0[1], offset); w5[0] = amd_bytealign ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: w7[3] = amd_bytealign (w2[1], w2[2], offset); w7[2] = amd_bytealign (w2[0], w2[1], offset); w7[1] = amd_bytealign (w1[3], w2[0], offset); w7[0] = amd_bytealign (w1[2], w1[3], offset); w6[3] = amd_bytealign (w1[1], w1[2], offset); w6[2] = amd_bytealign (w1[0], w1[1], offset); w6[1] = amd_bytealign (w0[3], w1[0], offset); w6[0] = amd_bytealign (w0[2], w0[3], offset); w5[3] = amd_bytealign (w0[1], w0[2], offset); w5[2] = amd_bytealign (w0[0], w0[1], offset); w5[1] = amd_bytealign ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: w7[3] = amd_bytealign (w2[0], w2[1], offset); w7[2] = amd_bytealign (w1[3], w2[0], offset); w7[1] = amd_bytealign (w1[2], w1[3], offset); w7[0] = amd_bytealign (w1[1], w1[2], offset); w6[3] = amd_bytealign (w1[0], w1[1], offset); w6[2] = amd_bytealign (w0[3], w1[0], offset); w6[1] = amd_bytealign (w0[2], w0[3], offset); w6[0] = amd_bytealign 
(w0[1], w0[2], offset); w5[3] = amd_bytealign (w0[0], w0[1], offset); w5[2] = amd_bytealign ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: w7[3] = amd_bytealign (w1[3], w2[0], offset); w7[2] = amd_bytealign (w1[2], w1[3], offset); w7[1] = amd_bytealign (w1[1], w1[2], offset); w7[0] = amd_bytealign (w1[0], w1[1], offset); w6[3] = amd_bytealign (w0[3], w1[0], offset); w6[2] = amd_bytealign (w0[2], w0[3], offset); w6[1] = amd_bytealign (w0[1], w0[2], offset); w6[0] = amd_bytealign (w0[0], w0[1], offset); w5[3] = amd_bytealign ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: w7[3] = amd_bytealign (w1[2], w1[3], offset); w7[2] = amd_bytealign (w1[1], w1[2], offset); w7[1] = amd_bytealign (w1[0], w1[1], offset); w7[0] = amd_bytealign (w0[3], w1[0], offset); w6[3] = amd_bytealign (w0[2], w0[3], offset); w6[2] = amd_bytealign (w0[1], w0[2], offset); w6[1] = amd_bytealign (w0[0], w0[1], offset); w6[0] = amd_bytealign ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: w7[3] = amd_bytealign (w1[1], w1[2], offset); w7[2] = amd_bytealign (w1[0], w1[1], offset); w7[1] = amd_bytealign (w0[3], w1[0], offset); w7[0] = amd_bytealign (w0[2], w0[3], offset); w6[3] = amd_bytealign (w0[1], w0[2], offset); w6[2] = amd_bytealign (w0[0], w0[1], offset); w6[1] = amd_bytealign ( 0, 
w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: w7[3] = amd_bytealign (w1[0], w1[1], offset); w7[2] = amd_bytealign (w0[3], w1[0], offset); w7[1] = amd_bytealign (w0[2], w0[3], offset); w7[0] = amd_bytealign (w0[1], w0[2], offset); w6[3] = amd_bytealign (w0[0], w0[1], offset); w6[2] = amd_bytealign ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: w7[3] = amd_bytealign (w0[3], w1[0], offset); w7[2] = amd_bytealign (w0[2], w0[3], offset); w7[1] = amd_bytealign (w0[1], w0[2], offset); w7[0] = amd_bytealign (w0[0], w0[1], offset); w6[3] = amd_bytealign ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: w7[3] = amd_bytealign (w0[2], w0[3], offset); w7[2] = amd_bytealign (w0[1], w0[2], offset); w7[1] = amd_bytealign (w0[0], w0[1], offset); w7[0] = amd_bytealign ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: w7[3] = amd_bytealign (w0[1], w0[2], offset); w7[2] = amd_bytealign 
(w0[0], w0[1], offset); w7[1] = amd_bytealign ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: w7[3] = amd_bytealign (w0[0], w0[1], offset); w7[2] = amd_bytealign ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: w7[3] = amd_bytealign ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } w0[0] = swap32 (w0[0]); w0[1] = swap32 (w0[1]); w0[2] = swap32 (w0[2]); w0[3] = swap32 (w0[3]); w1[0] = swap32 (w1[0]); w1[1] = swap32 (w1[1]); w1[2] = swap32 (w1[2]); w1[3] = swap32 (w1[3]); w2[0] = swap32 (w2[0]); w2[1] = swap32 (w2[1]); w2[2] = swap32 (w2[2]); w2[3] = swap32 (w2[3]); w3[0] = swap32 (w3[0]); w3[1] = swap32 (w3[1]); w3[2] = swap32 (w3[2]); w3[3] = swap32 (w3[3]); w4[0] = swap32 (w4[0]); w4[1] = swap32 (w4[1]); w4[2] = swap32 (w4[2]); w4[3] = swap32 (w4[3]); w5[0] = swap32 (w5[0]); w5[1] = swap32 (w5[1]); w5[2] = swap32 (w5[2]); w5[3] = swap32 (w5[3]); w6[0] = swap32 (w6[0]); w6[1] = swap32 (w6[1]); w6[2] = swap32 (w6[2]); w6[3] = swap32 (w6[3]); w7[0] = swap32 (w7[0]); w7[1] = swap32 (w7[1]); w7[2] = swap32 (w7[2]); w7[3] = swap32 (w7[3]); 
#endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif switch (offset_switch) { case 0: w7[3] = __byte_perm (w7[2], w7[3], selector); w7[2] = __byte_perm (w7[1], w7[2], selector); w7[1] = __byte_perm (w7[0], w7[1], selector); w7[0] = __byte_perm (w6[3], w7[0], selector); w6[3] = __byte_perm (w6[2], w6[3], selector); w6[2] = __byte_perm (w6[1], w6[2], selector); w6[1] = __byte_perm (w6[0], w6[1], selector); w6[0] = __byte_perm (w5[3], w6[0], selector); w5[3] = __byte_perm (w5[2], w5[3], selector); w5[2] = __byte_perm (w5[1], w5[2], selector); w5[1] = __byte_perm (w5[0], w5[1], selector); w5[0] = __byte_perm (w4[3], w5[0], selector); w4[3] = __byte_perm (w4[2], w4[3], selector); w4[2] = __byte_perm (w4[1], w4[2], selector); w4[1] = __byte_perm (w4[0], w4[1], selector); w4[0] = __byte_perm (w3[3], w4[0], selector); w3[3] = __byte_perm (w3[2], w3[3], selector); w3[2] = __byte_perm (w3[1], w3[2], selector); w3[1] = __byte_perm (w3[0], w3[1], selector); w3[0] = __byte_perm (w2[3], w3[0], selector); w2[3] = __byte_perm (w2[2], w2[3], selector); w2[2] = __byte_perm (w2[1], w2[2], selector); w2[1] = __byte_perm (w2[0], w2[1], selector); w2[0] = __byte_perm (w1[3], w2[0], selector); w1[3] = __byte_perm (w1[2], w1[3], selector); w1[2] = __byte_perm (w1[1], w1[2], selector); w1[1] = __byte_perm (w1[0], w1[1], selector); w1[0] = __byte_perm (w0[3], w1[0], selector); w0[3] = __byte_perm (w0[2], w0[3], selector); w0[2] = __byte_perm (w0[1], w0[2], selector); w0[1] = __byte_perm (w0[0], w0[1], selector); w0[0] = __byte_perm ( 0, w0[0], selector); break; case 1: w7[3] = __byte_perm (w7[1], w7[2], selector); w7[2] = __byte_perm (w7[0], w7[1], selector); w7[1] = __byte_perm (w6[3], w7[0], selector); w7[0] = __byte_perm (w6[2], w6[3], selector); w6[3] = __byte_perm (w6[1], w6[2], selector); w6[2] = 
__byte_perm (w6[0], w6[1], selector); w6[1] = __byte_perm (w5[3], w6[0], selector); w6[0] = __byte_perm (w5[2], w5[3], selector); w5[3] = __byte_perm (w5[1], w5[2], selector); w5[2] = __byte_perm (w5[0], w5[1], selector); w5[1] = __byte_perm (w4[3], w5[0], selector); w5[0] = __byte_perm (w4[2], w4[3], selector); w4[3] = __byte_perm (w4[1], w4[2], selector); w4[2] = __byte_perm (w4[0], w4[1], selector); w4[1] = __byte_perm (w3[3], w4[0], selector); w4[0] = __byte_perm (w3[2], w3[3], selector); w3[3] = __byte_perm (w3[1], w3[2], selector); w3[2] = __byte_perm (w3[0], w3[1], selector); w3[1] = __byte_perm (w2[3], w3[0], selector); w3[0] = __byte_perm (w2[2], w2[3], selector); w2[3] = __byte_perm (w2[1], w2[2], selector); w2[2] = __byte_perm (w2[0], w2[1], selector); w2[1] = __byte_perm (w1[3], w2[0], selector); w2[0] = __byte_perm (w1[2], w1[3], selector); w1[3] = __byte_perm (w1[1], w1[2], selector); w1[2] = __byte_perm (w1[0], w1[1], selector); w1[1] = __byte_perm (w0[3], w1[0], selector); w1[0] = __byte_perm (w0[2], w0[3], selector); w0[3] = __byte_perm (w0[1], w0[2], selector); w0[2] = __byte_perm (w0[0], w0[1], selector); w0[1] = __byte_perm ( 0, w0[0], selector); w0[0] = 0; break; case 2: w7[3] = __byte_perm (w7[0], w7[1], selector); w7[2] = __byte_perm (w6[3], w7[0], selector); w7[1] = __byte_perm (w6[2], w6[3], selector); w7[0] = __byte_perm (w6[1], w6[2], selector); w6[3] = __byte_perm (w6[0], w6[1], selector); w6[2] = __byte_perm (w5[3], w6[0], selector); w6[1] = __byte_perm (w5[2], w5[3], selector); w6[0] = __byte_perm (w5[1], w5[2], selector); w5[3] = __byte_perm (w5[0], w5[1], selector); w5[2] = __byte_perm (w4[3], w5[0], selector); w5[1] = __byte_perm (w4[2], w4[3], selector); w5[0] = __byte_perm (w4[1], w4[2], selector); w4[3] = __byte_perm (w4[0], w4[1], selector); w4[2] = __byte_perm (w3[3], w4[0], selector); w4[1] = __byte_perm (w3[2], w3[3], selector); w4[0] = __byte_perm (w3[1], w3[2], selector); w3[3] = __byte_perm (w3[0], w3[1], selector); w3[2] 
= __byte_perm (w2[3], w3[0], selector); w3[1] = __byte_perm (w2[2], w2[3], selector); w3[0] = __byte_perm (w2[1], w2[2], selector); w2[3] = __byte_perm (w2[0], w2[1], selector); w2[2] = __byte_perm (w1[3], w2[0], selector); w2[1] = __byte_perm (w1[2], w1[3], selector); w2[0] = __byte_perm (w1[1], w1[2], selector); w1[3] = __byte_perm (w1[0], w1[1], selector); w1[2] = __byte_perm (w0[3], w1[0], selector); w1[1] = __byte_perm (w0[2], w0[3], selector); w1[0] = __byte_perm (w0[1], w0[2], selector); w0[3] = __byte_perm (w0[0], w0[1], selector); w0[2] = __byte_perm ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: w7[3] = __byte_perm (w6[3], w7[0], selector); w7[2] = __byte_perm (w6[2], w6[3], selector); w7[1] = __byte_perm (w6[1], w6[2], selector); w7[0] = __byte_perm (w6[0], w6[1], selector); w6[3] = __byte_perm (w5[3], w6[0], selector); w6[2] = __byte_perm (w5[2], w5[3], selector); w6[1] = __byte_perm (w5[1], w5[2], selector); w6[0] = __byte_perm (w5[0], w5[1], selector); w5[3] = __byte_perm (w4[3], w5[0], selector); w5[2] = __byte_perm (w4[2], w4[3], selector); w5[1] = __byte_perm (w4[1], w4[2], selector); w5[0] = __byte_perm (w4[0], w4[1], selector); w4[3] = __byte_perm (w3[3], w4[0], selector); w4[2] = __byte_perm (w3[2], w3[3], selector); w4[1] = __byte_perm (w3[1], w3[2], selector); w4[0] = __byte_perm (w3[0], w3[1], selector); w3[3] = __byte_perm (w2[3], w3[0], selector); w3[2] = __byte_perm (w2[2], w2[3], selector); w3[1] = __byte_perm (w2[1], w2[2], selector); w3[0] = __byte_perm (w2[0], w2[1], selector); w2[3] = __byte_perm (w1[3], w2[0], selector); w2[2] = __byte_perm (w1[2], w1[3], selector); w2[1] = __byte_perm (w1[1], w1[2], selector); w2[0] = __byte_perm (w1[0], w1[1], selector); w1[3] = __byte_perm (w0[3], w1[0], selector); w1[2] = __byte_perm (w0[2], w0[3], selector); w1[1] = __byte_perm (w0[1], w0[2], selector); w1[0] = __byte_perm (w0[0], w0[1], selector); w0[3] = __byte_perm ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; 
case 4: w7[3] = __byte_perm (w6[2], w6[3], selector); w7[2] = __byte_perm (w6[1], w6[2], selector); w7[1] = __byte_perm (w6[0], w6[1], selector); w7[0] = __byte_perm (w5[3], w6[0], selector); w6[3] = __byte_perm (w5[2], w5[3], selector); w6[2] = __byte_perm (w5[1], w5[2], selector); w6[1] = __byte_perm (w5[0], w5[1], selector); w6[0] = __byte_perm (w4[3], w5[0], selector); w5[3] = __byte_perm (w4[2], w4[3], selector); w5[2] = __byte_perm (w4[1], w4[2], selector); w5[1] = __byte_perm (w4[0], w4[1], selector); w5[0] = __byte_perm (w3[3], w4[0], selector); w4[3] = __byte_perm (w3[2], w3[3], selector); w4[2] = __byte_perm (w3[1], w3[2], selector); w4[1] = __byte_perm (w3[0], w3[1], selector); w4[0] = __byte_perm (w2[3], w3[0], selector); w3[3] = __byte_perm (w2[2], w2[3], selector); w3[2] = __byte_perm (w2[1], w2[2], selector); w3[1] = __byte_perm (w2[0], w2[1], selector); w3[0] = __byte_perm (w1[3], w2[0], selector); w2[3] = __byte_perm (w1[2], w1[3], selector); w2[2] = __byte_perm (w1[1], w1[2], selector); w2[1] = __byte_perm (w1[0], w1[1], selector); w2[0] = __byte_perm (w0[3], w1[0], selector); w1[3] = __byte_perm (w0[2], w0[3], selector); w1[2] = __byte_perm (w0[1], w0[2], selector); w1[1] = __byte_perm (w0[0], w0[1], selector); w1[0] = __byte_perm ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = __byte_perm (w6[1], w6[2], selector); w7[2] = __byte_perm (w6[0], w6[1], selector); w7[1] = __byte_perm (w5[3], w6[0], selector); w7[0] = __byte_perm (w5[2], w5[3], selector); w6[3] = __byte_perm (w5[1], w5[2], selector); w6[2] = __byte_perm (w5[0], w5[1], selector); w6[1] = __byte_perm (w4[3], w5[0], selector); w6[0] = __byte_perm (w4[2], w4[3], selector); w5[3] = __byte_perm (w4[1], w4[2], selector); w5[2] = __byte_perm (w4[0], w4[1], selector); w5[1] = __byte_perm (w3[3], w4[0], selector); w5[0] = __byte_perm (w3[2], w3[3], selector); w4[3] = __byte_perm (w3[1], w3[2], selector); w4[2] = __byte_perm (w3[0], w3[1], selector); 
w4[1] = __byte_perm (w2[3], w3[0], selector); w4[0] = __byte_perm (w2[2], w2[3], selector); w3[3] = __byte_perm (w2[1], w2[2], selector); w3[2] = __byte_perm (w2[0], w2[1], selector); w3[1] = __byte_perm (w1[3], w2[0], selector); w3[0] = __byte_perm (w1[2], w1[3], selector); w2[3] = __byte_perm (w1[1], w1[2], selector); w2[2] = __byte_perm (w1[0], w1[1], selector); w2[1] = __byte_perm (w0[3], w1[0], selector); w2[0] = __byte_perm (w0[2], w0[3], selector); w1[3] = __byte_perm (w0[1], w0[2], selector); w1[2] = __byte_perm (w0[0], w0[1], selector); w1[1] = __byte_perm ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = __byte_perm (w6[0], w6[1], selector); w7[2] = __byte_perm (w5[3], w6[0], selector); w7[1] = __byte_perm (w5[2], w5[3], selector); w7[0] = __byte_perm (w5[1], w5[2], selector); w6[3] = __byte_perm (w5[0], w5[1], selector); w6[2] = __byte_perm (w4[3], w5[0], selector); w6[1] = __byte_perm (w4[2], w4[3], selector); w6[0] = __byte_perm (w4[1], w4[2], selector); w5[3] = __byte_perm (w4[0], w4[1], selector); w5[2] = __byte_perm (w3[3], w4[0], selector); w5[1] = __byte_perm (w3[2], w3[3], selector); w5[0] = __byte_perm (w3[1], w3[2], selector); w4[3] = __byte_perm (w3[0], w3[1], selector); w4[2] = __byte_perm (w2[3], w3[0], selector); w4[1] = __byte_perm (w2[2], w2[3], selector); w4[0] = __byte_perm (w2[1], w2[2], selector); w3[3] = __byte_perm (w2[0], w2[1], selector); w3[2] = __byte_perm (w1[3], w2[0], selector); w3[1] = __byte_perm (w1[2], w1[3], selector); w3[0] = __byte_perm (w1[1], w1[2], selector); w2[3] = __byte_perm (w1[0], w1[1], selector); w2[2] = __byte_perm (w0[3], w1[0], selector); w2[1] = __byte_perm (w0[2], w0[3], selector); w2[0] = __byte_perm (w0[1], w0[2], selector); w1[3] = __byte_perm (w0[0], w0[1], selector); w1[2] = __byte_perm ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = __byte_perm (w5[3], w6[0], selector); w7[2] = 
__byte_perm (w5[2], w5[3], selector); w7[1] = __byte_perm (w5[1], w5[2], selector); w7[0] = __byte_perm (w5[0], w5[1], selector); w6[3] = __byte_perm (w4[3], w5[0], selector); w6[2] = __byte_perm (w4[2], w4[3], selector); w6[1] = __byte_perm (w4[1], w4[2], selector); w6[0] = __byte_perm (w4[0], w4[1], selector); w5[3] = __byte_perm (w3[3], w4[0], selector); w5[2] = __byte_perm (w3[2], w3[3], selector); w5[1] = __byte_perm (w3[1], w3[2], selector); w5[0] = __byte_perm (w3[0], w3[1], selector); w4[3] = __byte_perm (w2[3], w3[0], selector); w4[2] = __byte_perm (w2[2], w2[3], selector); w4[1] = __byte_perm (w2[1], w2[2], selector); w4[0] = __byte_perm (w2[0], w2[1], selector); w3[3] = __byte_perm (w1[3], w2[0], selector); w3[2] = __byte_perm (w1[2], w1[3], selector); w3[1] = __byte_perm (w1[1], w1[2], selector); w3[0] = __byte_perm (w1[0], w1[1], selector); w2[3] = __byte_perm (w0[3], w1[0], selector); w2[2] = __byte_perm (w0[2], w0[3], selector); w2[1] = __byte_perm (w0[1], w0[2], selector); w2[0] = __byte_perm (w0[0], w0[1], selector); w1[3] = __byte_perm ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = __byte_perm (w5[2], w5[3], selector); w7[2] = __byte_perm (w5[1], w5[2], selector); w7[1] = __byte_perm (w5[0], w5[1], selector); w7[0] = __byte_perm (w4[3], w5[0], selector); w6[3] = __byte_perm (w4[2], w4[3], selector); w6[2] = __byte_perm (w4[1], w4[2], selector); w6[1] = __byte_perm (w4[0], w4[1], selector); w6[0] = __byte_perm (w3[3], w4[0], selector); w5[3] = __byte_perm (w3[2], w3[3], selector); w5[2] = __byte_perm (w3[1], w3[2], selector); w5[1] = __byte_perm (w3[0], w3[1], selector); w5[0] = __byte_perm (w2[3], w3[0], selector); w4[3] = __byte_perm (w2[2], w2[3], selector); w4[2] = __byte_perm (w2[1], w2[2], selector); w4[1] = __byte_perm (w2[0], w2[1], selector); w4[0] = __byte_perm (w1[3], w2[0], selector); w3[3] = __byte_perm (w1[2], w1[3], selector); w3[2] = __byte_perm (w1[1], 
w1[2], selector); w3[1] = __byte_perm (w1[0], w1[1], selector); w3[0] = __byte_perm (w0[3], w1[0], selector); w2[3] = __byte_perm (w0[2], w0[3], selector); w2[2] = __byte_perm (w0[1], w0[2], selector); w2[1] = __byte_perm (w0[0], w0[1], selector); w2[0] = __byte_perm ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = __byte_perm (w5[1], w5[2], selector); w7[2] = __byte_perm (w5[0], w5[1], selector); w7[1] = __byte_perm (w4[3], w5[0], selector); w7[0] = __byte_perm (w4[2], w4[3], selector); w6[3] = __byte_perm (w4[1], w4[2], selector); w6[2] = __byte_perm (w4[0], w4[1], selector); w6[1] = __byte_perm (w3[3], w4[0], selector); w6[0] = __byte_perm (w3[2], w3[3], selector); w5[3] = __byte_perm (w3[1], w3[2], selector); w5[2] = __byte_perm (w3[0], w3[1], selector); w5[1] = __byte_perm (w2[3], w3[0], selector); w5[0] = __byte_perm (w2[2], w2[3], selector); w4[3] = __byte_perm (w2[1], w2[2], selector); w4[2] = __byte_perm (w2[0], w2[1], selector); w4[1] = __byte_perm (w1[3], w2[0], selector); w4[0] = __byte_perm (w1[2], w1[3], selector); w3[3] = __byte_perm (w1[1], w1[2], selector); w3[2] = __byte_perm (w1[0], w1[1], selector); w3[1] = __byte_perm (w0[3], w1[0], selector); w3[0] = __byte_perm (w0[2], w0[3], selector); w2[3] = __byte_perm (w0[1], w0[2], selector); w2[2] = __byte_perm (w0[0], w0[1], selector); w2[1] = __byte_perm ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = __byte_perm (w5[0], w5[1], selector); w7[2] = __byte_perm (w4[3], w5[0], selector); w7[1] = __byte_perm (w4[2], w4[3], selector); w7[0] = __byte_perm (w4[1], w4[2], selector); w6[3] = __byte_perm (w4[0], w4[1], selector); w6[2] = __byte_perm (w3[3], w4[0], selector); w6[1] = __byte_perm (w3[2], w3[3], selector); w6[0] = __byte_perm (w3[1], w3[2], selector); w5[3] = __byte_perm (w3[0], w3[1], selector); w5[2] = __byte_perm 
(w2[3], w3[0], selector); w5[1] = __byte_perm (w2[2], w2[3], selector); w5[0] = __byte_perm (w2[1], w2[2], selector); w4[3] = __byte_perm (w2[0], w2[1], selector); w4[2] = __byte_perm (w1[3], w2[0], selector); w4[1] = __byte_perm (w1[2], w1[3], selector); w4[0] = __byte_perm (w1[1], w1[2], selector); w3[3] = __byte_perm (w1[0], w1[1], selector); w3[2] = __byte_perm (w0[3], w1[0], selector); w3[1] = __byte_perm (w0[2], w0[3], selector); w3[0] = __byte_perm (w0[1], w0[2], selector); w2[3] = __byte_perm (w0[0], w0[1], selector); w2[2] = __byte_perm ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = __byte_perm (w4[3], w5[0], selector); w7[2] = __byte_perm (w4[2], w4[3], selector); w7[1] = __byte_perm (w4[1], w4[2], selector); w7[0] = __byte_perm (w4[0], w4[1], selector); w6[3] = __byte_perm (w3[3], w4[0], selector); w6[2] = __byte_perm (w3[2], w3[3], selector); w6[1] = __byte_perm (w3[1], w3[2], selector); w6[0] = __byte_perm (w3[0], w3[1], selector); w5[3] = __byte_perm (w2[3], w3[0], selector); w5[2] = __byte_perm (w2[2], w2[3], selector); w5[1] = __byte_perm (w2[1], w2[2], selector); w5[0] = __byte_perm (w2[0], w2[1], selector); w4[3] = __byte_perm (w1[3], w2[0], selector); w4[2] = __byte_perm (w1[2], w1[3], selector); w4[1] = __byte_perm (w1[1], w1[2], selector); w4[0] = __byte_perm (w1[0], w1[1], selector); w3[3] = __byte_perm (w0[3], w1[0], selector); w3[2] = __byte_perm (w0[2], w0[3], selector); w3[1] = __byte_perm (w0[1], w0[2], selector); w3[0] = __byte_perm (w0[0], w0[1], selector); w2[3] = __byte_perm ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = __byte_perm (w4[2], w4[3], selector); w7[2] = __byte_perm (w4[1], w4[2], selector); w7[1] = __byte_perm (w4[0], w4[1], selector); w7[0] = __byte_perm (w3[3], w4[0], selector); w6[3] = 
__byte_perm (w3[2], w3[3], selector); w6[2] = __byte_perm (w3[1], w3[2], selector); w6[1] = __byte_perm (w3[0], w3[1], selector); w6[0] = __byte_perm (w2[3], w3[0], selector); w5[3] = __byte_perm (w2[2], w2[3], selector); w5[2] = __byte_perm (w2[1], w2[2], selector); w5[1] = __byte_perm (w2[0], w2[1], selector); w5[0] = __byte_perm (w1[3], w2[0], selector); w4[3] = __byte_perm (w1[2], w1[3], selector); w4[2] = __byte_perm (w1[1], w1[2], selector); w4[1] = __byte_perm (w1[0], w1[1], selector); w4[0] = __byte_perm (w0[3], w1[0], selector); w3[3] = __byte_perm (w0[2], w0[3], selector); w3[2] = __byte_perm (w0[1], w0[2], selector); w3[1] = __byte_perm (w0[0], w0[1], selector); w3[0] = __byte_perm ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = __byte_perm (w4[1], w4[2], selector); w7[2] = __byte_perm (w4[0], w4[1], selector); w7[1] = __byte_perm (w3[3], w4[0], selector); w7[0] = __byte_perm (w3[2], w3[3], selector); w6[3] = __byte_perm (w3[1], w3[2], selector); w6[2] = __byte_perm (w3[0], w3[1], selector); w6[1] = __byte_perm (w2[3], w3[0], selector); w6[0] = __byte_perm (w2[2], w2[3], selector); w5[3] = __byte_perm (w2[1], w2[2], selector); w5[2] = __byte_perm (w2[0], w2[1], selector); w5[1] = __byte_perm (w1[3], w2[0], selector); w5[0] = __byte_perm (w1[2], w1[3], selector); w4[3] = __byte_perm (w1[1], w1[2], selector); w4[2] = __byte_perm (w1[0], w1[1], selector); w4[1] = __byte_perm (w0[3], w1[0], selector); w4[0] = __byte_perm (w0[2], w0[3], selector); w3[3] = __byte_perm (w0[1], w0[2], selector); w3[2] = __byte_perm (w0[0], w0[1], selector); w3[1] = __byte_perm ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = __byte_perm (w4[0], w4[1], selector); w7[2] = __byte_perm (w3[3], w4[0], selector); 
w7[1] = __byte_perm (w3[2], w3[3], selector); w7[0] = __byte_perm (w3[1], w3[2], selector); w6[3] = __byte_perm (w3[0], w3[1], selector); w6[2] = __byte_perm (w2[3], w3[0], selector); w6[1] = __byte_perm (w2[2], w2[3], selector); w6[0] = __byte_perm (w2[1], w2[2], selector); w5[3] = __byte_perm (w2[0], w2[1], selector); w5[2] = __byte_perm (w1[3], w2[0], selector); w5[1] = __byte_perm (w1[2], w1[3], selector); w5[0] = __byte_perm (w1[1], w1[2], selector); w4[3] = __byte_perm (w1[0], w1[1], selector); w4[2] = __byte_perm (w0[3], w1[0], selector); w4[1] = __byte_perm (w0[2], w0[3], selector); w4[0] = __byte_perm (w0[1], w0[2], selector); w3[3] = __byte_perm (w0[0], w0[1], selector); w3[2] = __byte_perm ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = __byte_perm (w3[3], w4[0], selector); w7[2] = __byte_perm (w3[2], w3[3], selector); w7[1] = __byte_perm (w3[1], w3[2], selector); w7[0] = __byte_perm (w3[0], w3[1], selector); w6[3] = __byte_perm (w2[3], w3[0], selector); w6[2] = __byte_perm (w2[2], w2[3], selector); w6[1] = __byte_perm (w2[1], w2[2], selector); w6[0] = __byte_perm (w2[0], w2[1], selector); w5[3] = __byte_perm (w1[3], w2[0], selector); w5[2] = __byte_perm (w1[2], w1[3], selector); w5[1] = __byte_perm (w1[1], w1[2], selector); w5[0] = __byte_perm (w1[0], w1[1], selector); w4[3] = __byte_perm (w0[3], w1[0], selector); w4[2] = __byte_perm (w0[2], w0[3], selector); w4[1] = __byte_perm (w0[1], w0[2], selector); w4[0] = __byte_perm (w0[0], w0[1], selector); w3[3] = __byte_perm ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_8x4_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x 
w7[4], const u32 offset) { const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: w7[3] = amd_bytealign (w7[2], w7[3], offset); w7[2] = amd_bytealign (w7[1], w7[2], offset); w7[1] = amd_bytealign (w7[0], w7[1], offset); w7[0] = amd_bytealign (w6[3], w7[0], offset); w6[3] = amd_bytealign (w6[2], w6[3], offset); w6[2] = amd_bytealign (w6[1], w6[2], offset); w6[1] = amd_bytealign (w6[0], w6[1], offset); w6[0] = amd_bytealign (w5[3], w6[0], offset); w5[3] = amd_bytealign (w5[2], w5[3], offset); w5[2] = amd_bytealign (w5[1], w5[2], offset); w5[1] = amd_bytealign (w5[0], w5[1], offset); w5[0] = amd_bytealign (w4[3], w5[0], offset); w4[3] = amd_bytealign (w4[2], w4[3], offset); w4[2] = amd_bytealign (w4[1], w4[2], offset); w4[1] = amd_bytealign (w4[0], w4[1], offset); w4[0] = amd_bytealign (w3[3], w4[0], offset); w3[3] = amd_bytealign (w3[2], w3[3], offset); w3[2] = amd_bytealign (w3[1], w3[2], offset); w3[1] = amd_bytealign (w3[0], w3[1], offset); w3[0] = amd_bytealign (w2[3], w3[0], offset); w2[3] = amd_bytealign (w2[2], w2[3], offset); w2[2] = amd_bytealign (w2[1], w2[2], offset); w2[1] = amd_bytealign (w2[0], w2[1], offset); w2[0] = amd_bytealign (w1[3], w2[0], offset); w1[3] = amd_bytealign (w1[2], w1[3], offset); w1[2] = amd_bytealign (w1[1], w1[2], offset); w1[1] = amd_bytealign (w1[0], w1[1], offset); w1[0] = amd_bytealign (w0[3], w1[0], offset); w0[3] = amd_bytealign (w0[2], w0[3], offset); w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: w7[3] = amd_bytealign (w7[1], w7[2], offset); w7[2] = amd_bytealign (w7[0], w7[1], offset); w7[1] = amd_bytealign (w6[3], w7[0], offset); w7[0] = amd_bytealign (w6[2], w6[3], offset); w6[3] = amd_bytealign (w6[1], w6[2], offset); w6[2] = amd_bytealign (w6[0], w6[1], offset); w6[1] = amd_bytealign (w5[3], w6[0], offset); w6[0] = amd_bytealign (w5[2], w5[3], 
offset); w5[3] = amd_bytealign (w5[1], w5[2], offset); w5[2] = amd_bytealign (w5[0], w5[1], offset); w5[1] = amd_bytealign (w4[3], w5[0], offset); w5[0] = amd_bytealign (w4[2], w4[3], offset); w4[3] = amd_bytealign (w4[1], w4[2], offset); w4[2] = amd_bytealign (w4[0], w4[1], offset); w4[1] = amd_bytealign (w3[3], w4[0], offset); w4[0] = amd_bytealign (w3[2], w3[3], offset); w3[3] = amd_bytealign (w3[1], w3[2], offset); w3[2] = amd_bytealign (w3[0], w3[1], offset); w3[1] = amd_bytealign (w2[3], w3[0], offset); w3[0] = amd_bytealign (w2[2], w2[3], offset); w2[3] = amd_bytealign (w2[1], w2[2], offset); w2[2] = amd_bytealign (w2[0], w2[1], offset); w2[1] = amd_bytealign (w1[3], w2[0], offset); w2[0] = amd_bytealign (w1[2], w1[3], offset); w1[3] = amd_bytealign (w1[1], w1[2], offset); w1[2] = amd_bytealign (w1[0], w1[1], offset); w1[1] = amd_bytealign (w0[3], w1[0], offset); w1[0] = amd_bytealign (w0[2], w0[3], offset); w0[3] = amd_bytealign (w0[1], w0[2], offset); w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: w7[3] = amd_bytealign (w7[0], w7[1], offset); w7[2] = amd_bytealign (w6[3], w7[0], offset); w7[1] = amd_bytealign (w6[2], w6[3], offset); w7[0] = amd_bytealign (w6[1], w6[2], offset); w6[3] = amd_bytealign (w6[0], w6[1], offset); w6[2] = amd_bytealign (w5[3], w6[0], offset); w6[1] = amd_bytealign (w5[2], w5[3], offset); w6[0] = amd_bytealign (w5[1], w5[2], offset); w5[3] = amd_bytealign (w5[0], w5[1], offset); w5[2] = amd_bytealign (w4[3], w5[0], offset); w5[1] = amd_bytealign (w4[2], w4[3], offset); w5[0] = amd_bytealign (w4[1], w4[2], offset); w4[3] = amd_bytealign (w4[0], w4[1], offset); w4[2] = amd_bytealign (w3[3], w4[0], offset); w4[1] = amd_bytealign (w3[2], w3[3], offset); w4[0] = amd_bytealign (w3[1], w3[2], offset); w3[3] = amd_bytealign (w3[0], w3[1], offset); w3[2] = amd_bytealign (w2[3], w3[0], offset); w3[1] = amd_bytealign (w2[2], w2[3], offset); w3[0] = amd_bytealign (w2[1], 
w2[2], offset); w2[3] = amd_bytealign (w2[0], w2[1], offset); w2[2] = amd_bytealign (w1[3], w2[0], offset); w2[1] = amd_bytealign (w1[2], w1[3], offset); w2[0] = amd_bytealign (w1[1], w1[2], offset); w1[3] = amd_bytealign (w1[0], w1[1], offset); w1[2] = amd_bytealign (w0[3], w1[0], offset); w1[1] = amd_bytealign (w0[2], w0[3], offset); w1[0] = amd_bytealign (w0[1], w0[2], offset); w0[3] = amd_bytealign (w0[0], w0[1], offset); w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: w7[3] = amd_bytealign (w6[3], w7[0], offset); w7[2] = amd_bytealign (w6[2], w6[3], offset); w7[1] = amd_bytealign (w6[1], w6[2], offset); w7[0] = amd_bytealign (w6[0], w6[1], offset); w6[3] = amd_bytealign (w5[3], w6[0], offset); w6[2] = amd_bytealign (w5[2], w5[3], offset); w6[1] = amd_bytealign (w5[1], w5[2], offset); w6[0] = amd_bytealign (w5[0], w5[1], offset); w5[3] = amd_bytealign (w4[3], w5[0], offset); w5[2] = amd_bytealign (w4[2], w4[3], offset); w5[1] = amd_bytealign (w4[1], w4[2], offset); w5[0] = amd_bytealign (w4[0], w4[1], offset); w4[3] = amd_bytealign (w3[3], w4[0], offset); w4[2] = amd_bytealign (w3[2], w3[3], offset); w4[1] = amd_bytealign (w3[1], w3[2], offset); w4[0] = amd_bytealign (w3[0], w3[1], offset); w3[3] = amd_bytealign (w2[3], w3[0], offset); w3[2] = amd_bytealign (w2[2], w2[3], offset); w3[1] = amd_bytealign (w2[1], w2[2], offset); w3[0] = amd_bytealign (w2[0], w2[1], offset); w2[3] = amd_bytealign (w1[3], w2[0], offset); w2[2] = amd_bytealign (w1[2], w1[3], offset); w2[1] = amd_bytealign (w1[1], w1[2], offset); w2[0] = amd_bytealign (w1[0], w1[1], offset); w1[3] = amd_bytealign (w0[3], w1[0], offset); w1[2] = amd_bytealign (w0[2], w0[3], offset); w1[1] = amd_bytealign (w0[1], w0[2], offset); w1[0] = amd_bytealign (w0[0], w0[1], offset); w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w7[3] = amd_bytealign (w6[2], w6[3], offset); w7[2] = amd_bytealign (w6[1], w6[2], offset); w7[1] = 
amd_bytealign (w6[0], w6[1], offset); w7[0] = amd_bytealign (w5[3], w6[0], offset); w6[3] = amd_bytealign (w5[2], w5[3], offset); w6[2] = amd_bytealign (w5[1], w5[2], offset); w6[1] = amd_bytealign (w5[0], w5[1], offset); w6[0] = amd_bytealign (w4[3], w5[0], offset); w5[3] = amd_bytealign (w4[2], w4[3], offset); w5[2] = amd_bytealign (w4[1], w4[2], offset); w5[1] = amd_bytealign (w4[0], w4[1], offset); w5[0] = amd_bytealign (w3[3], w4[0], offset); w4[3] = amd_bytealign (w3[2], w3[3], offset); w4[2] = amd_bytealign (w3[1], w3[2], offset); w4[1] = amd_bytealign (w3[0], w3[1], offset); w4[0] = amd_bytealign (w2[3], w3[0], offset); w3[3] = amd_bytealign (w2[2], w2[3], offset); w3[2] = amd_bytealign (w2[1], w2[2], offset); w3[1] = amd_bytealign (w2[0], w2[1], offset); w3[0] = amd_bytealign (w1[3], w2[0], offset); w2[3] = amd_bytealign (w1[2], w1[3], offset); w2[2] = amd_bytealign (w1[1], w1[2], offset); w2[1] = amd_bytealign (w1[0], w1[1], offset); w2[0] = amd_bytealign (w0[3], w1[0], offset); w1[3] = amd_bytealign (w0[2], w0[3], offset); w1[2] = amd_bytealign (w0[1], w0[2], offset); w1[1] = amd_bytealign (w0[0], w0[1], offset); w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = amd_bytealign (w6[1], w6[2], offset); w7[2] = amd_bytealign (w6[0], w6[1], offset); w7[1] = amd_bytealign (w5[3], w6[0], offset); w7[0] = amd_bytealign (w5[2], w5[3], offset); w6[3] = amd_bytealign (w5[1], w5[2], offset); w6[2] = amd_bytealign (w5[0], w5[1], offset); w6[1] = amd_bytealign (w4[3], w5[0], offset); w6[0] = amd_bytealign (w4[2], w4[3], offset); w5[3] = amd_bytealign (w4[1], w4[2], offset); w5[2] = amd_bytealign (w4[0], w4[1], offset); w5[1] = amd_bytealign (w3[3], w4[0], offset); w5[0] = amd_bytealign (w3[2], w3[3], offset); w4[3] = amd_bytealign (w3[1], w3[2], offset); w4[2] = amd_bytealign (w3[0], w3[1], offset); w4[1] = amd_bytealign (w2[3], w3[0], offset); w4[0] = amd_bytealign (w2[2], w2[3], offset); w3[3] = 
amd_bytealign (w2[1], w2[2], offset); w3[2] = amd_bytealign (w2[0], w2[1], offset); w3[1] = amd_bytealign (w1[3], w2[0], offset); w3[0] = amd_bytealign (w1[2], w1[3], offset); w2[3] = amd_bytealign (w1[1], w1[2], offset); w2[2] = amd_bytealign (w1[0], w1[1], offset); w2[1] = amd_bytealign (w0[3], w1[0], offset); w2[0] = amd_bytealign (w0[2], w0[3], offset); w1[3] = amd_bytealign (w0[1], w0[2], offset); w1[2] = amd_bytealign (w0[0], w0[1], offset); w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = amd_bytealign (w6[0], w6[1], offset); w7[2] = amd_bytealign (w5[3], w6[0], offset); w7[1] = amd_bytealign (w5[2], w5[3], offset); w7[0] = amd_bytealign (w5[1], w5[2], offset); w6[3] = amd_bytealign (w5[0], w5[1], offset); w6[2] = amd_bytealign (w4[3], w5[0], offset); w6[1] = amd_bytealign (w4[2], w4[3], offset); w6[0] = amd_bytealign (w4[1], w4[2], offset); w5[3] = amd_bytealign (w4[0], w4[1], offset); w5[2] = amd_bytealign (w3[3], w4[0], offset); w5[1] = amd_bytealign (w3[2], w3[3], offset); w5[0] = amd_bytealign (w3[1], w3[2], offset); w4[3] = amd_bytealign (w3[0], w3[1], offset); w4[2] = amd_bytealign (w2[3], w3[0], offset); w4[1] = amd_bytealign (w2[2], w2[3], offset); w4[0] = amd_bytealign (w2[1], w2[2], offset); w3[3] = amd_bytealign (w2[0], w2[1], offset); w3[2] = amd_bytealign (w1[3], w2[0], offset); w3[1] = amd_bytealign (w1[2], w1[3], offset); w3[0] = amd_bytealign (w1[1], w1[2], offset); w2[3] = amd_bytealign (w1[0], w1[1], offset); w2[2] = amd_bytealign (w0[3], w1[0], offset); w2[1] = amd_bytealign (w0[2], w0[3], offset); w2[0] = amd_bytealign (w0[1], w0[2], offset); w1[3] = amd_bytealign (w0[0], w0[1], offset); w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = amd_bytealign (w5[3], w6[0], offset); w7[2] = amd_bytealign (w5[2], w5[3], offset); w7[1] = amd_bytealign (w5[1], w5[2], offset); w7[0] = amd_bytealign 
(w5[0], w5[1], offset); w6[3] = amd_bytealign (w4[3], w5[0], offset); w6[2] = amd_bytealign (w4[2], w4[3], offset); w6[1] = amd_bytealign (w4[1], w4[2], offset); w6[0] = amd_bytealign (w4[0], w4[1], offset); w5[3] = amd_bytealign (w3[3], w4[0], offset); w5[2] = amd_bytealign (w3[2], w3[3], offset); w5[1] = amd_bytealign (w3[1], w3[2], offset); w5[0] = amd_bytealign (w3[0], w3[1], offset); w4[3] = amd_bytealign (w2[3], w3[0], offset); w4[2] = amd_bytealign (w2[2], w2[3], offset); w4[1] = amd_bytealign (w2[1], w2[2], offset); w4[0] = amd_bytealign (w2[0], w2[1], offset); w3[3] = amd_bytealign (w1[3], w2[0], offset); w3[2] = amd_bytealign (w1[2], w1[3], offset); w3[1] = amd_bytealign (w1[1], w1[2], offset); w3[0] = amd_bytealign (w1[0], w1[1], offset); w2[3] = amd_bytealign (w0[3], w1[0], offset); w2[2] = amd_bytealign (w0[2], w0[3], offset); w2[1] = amd_bytealign (w0[1], w0[2], offset); w2[0] = amd_bytealign (w0[0], w0[1], offset); w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = amd_bytealign (w5[2], w5[3], offset); w7[2] = amd_bytealign (w5[1], w5[2], offset); w7[1] = amd_bytealign (w5[0], w5[1], offset); w7[0] = amd_bytealign (w4[3], w5[0], offset); w6[3] = amd_bytealign (w4[2], w4[3], offset); w6[2] = amd_bytealign (w4[1], w4[2], offset); w6[1] = amd_bytealign (w4[0], w4[1], offset); w6[0] = amd_bytealign (w3[3], w4[0], offset); w5[3] = amd_bytealign (w3[2], w3[3], offset); w5[2] = amd_bytealign (w3[1], w3[2], offset); w5[1] = amd_bytealign (w3[0], w3[1], offset); w5[0] = amd_bytealign (w2[3], w3[0], offset); w4[3] = amd_bytealign (w2[2], w2[3], offset); w4[2] = amd_bytealign (w2[1], w2[2], offset); w4[1] = amd_bytealign (w2[0], w2[1], offset); w4[0] = amd_bytealign (w1[3], w2[0], offset); w3[3] = amd_bytealign (w1[2], w1[3], offset); w3[2] = amd_bytealign (w1[1], w1[2], offset); w3[1] = amd_bytealign (w1[0], w1[1], offset); w3[0] = amd_bytealign (w0[3], w1[0], offset); 
w2[3] = amd_bytealign (w0[2], w0[3], offset); w2[2] = amd_bytealign (w0[1], w0[2], offset); w2[1] = amd_bytealign (w0[0], w0[1], offset); w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = amd_bytealign (w5[1], w5[2], offset); w7[2] = amd_bytealign (w5[0], w5[1], offset); w7[1] = amd_bytealign (w4[3], w5[0], offset); w7[0] = amd_bytealign (w4[2], w4[3], offset); w6[3] = amd_bytealign (w4[1], w4[2], offset); w6[2] = amd_bytealign (w4[0], w4[1], offset); w6[1] = amd_bytealign (w3[3], w4[0], offset); w6[0] = amd_bytealign (w3[2], w3[3], offset); w5[3] = amd_bytealign (w3[1], w3[2], offset); w5[2] = amd_bytealign (w3[0], w3[1], offset); w5[1] = amd_bytealign (w2[3], w3[0], offset); w5[0] = amd_bytealign (w2[2], w2[3], offset); w4[3] = amd_bytealign (w2[1], w2[2], offset); w4[2] = amd_bytealign (w2[0], w2[1], offset); w4[1] = amd_bytealign (w1[3], w2[0], offset); w4[0] = amd_bytealign (w1[2], w1[3], offset); w3[3] = amd_bytealign (w1[1], w1[2], offset); w3[2] = amd_bytealign (w1[0], w1[1], offset); w3[1] = amd_bytealign (w0[3], w1[0], offset); w3[0] = amd_bytealign (w0[2], w0[3], offset); w2[3] = amd_bytealign (w0[1], w0[2], offset); w2[2] = amd_bytealign (w0[0], w0[1], offset); w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = amd_bytealign (w5[0], w5[1], offset); w7[2] = amd_bytealign (w4[3], w5[0], offset); w7[1] = amd_bytealign (w4[2], w4[3], offset); w7[0] = amd_bytealign (w4[1], w4[2], offset); w6[3] = amd_bytealign (w4[0], w4[1], offset); w6[2] = amd_bytealign (w3[3], w4[0], offset); w6[1] = amd_bytealign (w3[2], w3[3], offset); w6[0] = amd_bytealign (w3[1], w3[2], offset); w5[3] = amd_bytealign (w3[0], w3[1], offset); w5[2] = amd_bytealign (w2[3], w3[0], offset); w5[1] = amd_bytealign (w2[2], w2[3], offset); w5[0] = amd_bytealign (w2[1], w2[2], 
offset); w4[3] = amd_bytealign (w2[0], w2[1], offset); w4[2] = amd_bytealign (w1[3], w2[0], offset); w4[1] = amd_bytealign (w1[2], w1[3], offset); w4[0] = amd_bytealign (w1[1], w1[2], offset); w3[3] = amd_bytealign (w1[0], w1[1], offset); w3[2] = amd_bytealign (w0[3], w1[0], offset); w3[1] = amd_bytealign (w0[2], w0[3], offset); w3[0] = amd_bytealign (w0[1], w0[2], offset); w2[3] = amd_bytealign (w0[0], w0[1], offset); w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = amd_bytealign (w4[3], w5[0], offset); w7[2] = amd_bytealign (w4[2], w4[3], offset); w7[1] = amd_bytealign (w4[1], w4[2], offset); w7[0] = amd_bytealign (w4[0], w4[1], offset); w6[3] = amd_bytealign (w3[3], w4[0], offset); w6[2] = amd_bytealign (w3[2], w3[3], offset); w6[1] = amd_bytealign (w3[1], w3[2], offset); w6[0] = amd_bytealign (w3[0], w3[1], offset); w5[3] = amd_bytealign (w2[3], w3[0], offset); w5[2] = amd_bytealign (w2[2], w2[3], offset); w5[1] = amd_bytealign (w2[1], w2[2], offset); w5[0] = amd_bytealign (w2[0], w2[1], offset); w4[3] = amd_bytealign (w1[3], w2[0], offset); w4[2] = amd_bytealign (w1[2], w1[3], offset); w4[1] = amd_bytealign (w1[1], w1[2], offset); w4[0] = amd_bytealign (w1[0], w1[1], offset); w3[3] = amd_bytealign (w0[3], w1[0], offset); w3[2] = amd_bytealign (w0[2], w0[3], offset); w3[1] = amd_bytealign (w0[1], w0[2], offset); w3[0] = amd_bytealign (w0[0], w0[1], offset); w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = amd_bytealign (w4[2], w4[3], offset); w7[2] = amd_bytealign (w4[1], w4[2], offset); w7[1] = amd_bytealign (w4[0], w4[1], offset); w7[0] = amd_bytealign (w3[3], w4[0], offset); w6[3] = amd_bytealign (w3[2], w3[3], offset); w6[2] = amd_bytealign (w3[1], w3[2], offset); w6[1] = amd_bytealign (w3[0], 
w3[1], offset); w6[0] = amd_bytealign (w2[3], w3[0], offset); w5[3] = amd_bytealign (w2[2], w2[3], offset); w5[2] = amd_bytealign (w2[1], w2[2], offset); w5[1] = amd_bytealign (w2[0], w2[1], offset); w5[0] = amd_bytealign (w1[3], w2[0], offset); w4[3] = amd_bytealign (w1[2], w1[3], offset); w4[2] = amd_bytealign (w1[1], w1[2], offset); w4[1] = amd_bytealign (w1[0], w1[1], offset); w4[0] = amd_bytealign (w0[3], w1[0], offset); w3[3] = amd_bytealign (w0[2], w0[3], offset); w3[2] = amd_bytealign (w0[1], w0[2], offset); w3[1] = amd_bytealign (w0[0], w0[1], offset); w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = amd_bytealign (w4[1], w4[2], offset); w7[2] = amd_bytealign (w4[0], w4[1], offset); w7[1] = amd_bytealign (w3[3], w4[0], offset); w7[0] = amd_bytealign (w3[2], w3[3], offset); w6[3] = amd_bytealign (w3[1], w3[2], offset); w6[2] = amd_bytealign (w3[0], w3[1], offset); w6[1] = amd_bytealign (w2[3], w3[0], offset); w6[0] = amd_bytealign (w2[2], w2[3], offset); w5[3] = amd_bytealign (w2[1], w2[2], offset); w5[2] = amd_bytealign (w2[0], w2[1], offset); w5[1] = amd_bytealign (w1[3], w2[0], offset); w5[0] = amd_bytealign (w1[2], w1[3], offset); w4[3] = amd_bytealign (w1[1], w1[2], offset); w4[2] = amd_bytealign (w1[0], w1[1], offset); w4[1] = amd_bytealign (w0[3], w1[0], offset); w4[0] = amd_bytealign (w0[2], w0[3], offset); w3[3] = amd_bytealign (w0[1], w0[2], offset); w3[2] = amd_bytealign (w0[0], w0[1], offset); w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = amd_bytealign (w4[0], w4[1], offset); w7[2] = amd_bytealign (w3[3], w4[0], offset); w7[1] = amd_bytealign (w3[2], w3[3], offset); w7[0] = amd_bytealign (w3[1], w3[2], offset); w6[3] = amd_bytealign 
(w3[0], w3[1], offset); w6[2] = amd_bytealign (w2[3], w3[0], offset); w6[1] = amd_bytealign (w2[2], w2[3], offset); w6[0] = amd_bytealign (w2[1], w2[2], offset); w5[3] = amd_bytealign (w2[0], w2[1], offset); w5[2] = amd_bytealign (w1[3], w2[0], offset); w5[1] = amd_bytealign (w1[2], w1[3], offset); w5[0] = amd_bytealign (w1[1], w1[2], offset); w4[3] = amd_bytealign (w1[0], w1[1], offset); w4[2] = amd_bytealign (w0[3], w1[0], offset); w4[1] = amd_bytealign (w0[2], w0[3], offset); w4[0] = amd_bytealign (w0[1], w0[2], offset); w3[3] = amd_bytealign (w0[0], w0[1], offset); w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = amd_bytealign (w3[3], w4[0], offset); w7[2] = amd_bytealign (w3[2], w3[3], offset); w7[1] = amd_bytealign (w3[1], w3[2], offset); w7[0] = amd_bytealign (w3[0], w3[1], offset); w6[3] = amd_bytealign (w2[3], w3[0], offset); w6[2] = amd_bytealign (w2[2], w2[3], offset); w6[1] = amd_bytealign (w2[1], w2[2], offset); w6[0] = amd_bytealign (w2[0], w2[1], offset); w5[3] = amd_bytealign (w1[3], w2[0], offset); w5[2] = amd_bytealign (w1[2], w1[3], offset); w5[1] = amd_bytealign (w1[1], w1[2], offset); w5[0] = amd_bytealign (w1[0], w1[1], offset); w4[3] = amd_bytealign (w0[3], w1[0], offset); w4[2] = amd_bytealign (w0[2], w0[3], offset); w4[1] = amd_bytealign (w0[1], w0[2], offset); w4[0] = amd_bytealign (w0[0], w0[1], offset); w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: w7[3] = amd_bytealign (w3[2], w3[3], offset); w7[2] = amd_bytealign (w3[1], w3[2], offset); w7[1] = amd_bytealign (w3[0], w3[1], offset); w7[0] = amd_bytealign (w2[3], w3[0], offset); w6[3] = amd_bytealign (w2[2], w2[3], offset); w6[2] = 
amd_bytealign (w2[1], w2[2], offset); w6[1] = amd_bytealign (w2[0], w2[1], offset); w6[0] = amd_bytealign (w1[3], w2[0], offset); w5[3] = amd_bytealign (w1[2], w1[3], offset); w5[2] = amd_bytealign (w1[1], w1[2], offset); w5[1] = amd_bytealign (w1[0], w1[1], offset); w5[0] = amd_bytealign (w0[3], w1[0], offset); w4[3] = amd_bytealign (w0[2], w0[3], offset); w4[2] = amd_bytealign (w0[1], w0[2], offset); w4[1] = amd_bytealign (w0[0], w0[1], offset); w4[0] = amd_bytealign ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: w7[3] = amd_bytealign (w3[1], w3[2], offset); w7[2] = amd_bytealign (w3[0], w3[1], offset); w7[1] = amd_bytealign (w2[3], w3[0], offset); w7[0] = amd_bytealign (w2[2], w2[3], offset); w6[3] = amd_bytealign (w2[1], w2[2], offset); w6[2] = amd_bytealign (w2[0], w2[1], offset); w6[1] = amd_bytealign (w1[3], w2[0], offset); w6[0] = amd_bytealign (w1[2], w1[3], offset); w5[3] = amd_bytealign (w1[1], w1[2], offset); w5[2] = amd_bytealign (w1[0], w1[1], offset); w5[1] = amd_bytealign (w0[3], w1[0], offset); w5[0] = amd_bytealign (w0[2], w0[3], offset); w4[3] = amd_bytealign (w0[1], w0[2], offset); w4[2] = amd_bytealign (w0[0], w0[1], offset); w4[1] = amd_bytealign ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: w7[3] = amd_bytealign (w3[0], w3[1], offset); w7[2] = amd_bytealign (w2[3], w3[0], offset); w7[1] = amd_bytealign (w2[2], w2[3], offset); w7[0] = amd_bytealign (w2[1], w2[2], offset); w6[3] = amd_bytealign (w2[0], w2[1], offset); w6[2] = amd_bytealign (w1[3], w2[0], offset); w6[1] = amd_bytealign (w1[2], w1[3], offset); w6[0] = amd_bytealign (w1[1], w1[2], offset); w5[3] = amd_bytealign (w1[0], w1[1], offset); w5[2] = 
amd_bytealign (w0[3], w1[0], offset); w5[1] = amd_bytealign (w0[2], w0[3], offset); w5[0] = amd_bytealign (w0[1], w0[2], offset); w4[3] = amd_bytealign (w0[0], w0[1], offset); w4[2] = amd_bytealign ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: w7[3] = amd_bytealign (w2[3], w3[0], offset); w7[2] = amd_bytealign (w2[2], w2[3], offset); w7[1] = amd_bytealign (w2[1], w2[2], offset); w7[0] = amd_bytealign (w2[0], w2[1], offset); w6[3] = amd_bytealign (w1[3], w2[0], offset); w6[2] = amd_bytealign (w1[2], w1[3], offset); w6[1] = amd_bytealign (w1[1], w1[2], offset); w6[0] = amd_bytealign (w1[0], w1[1], offset); w5[3] = amd_bytealign (w0[3], w1[0], offset); w5[2] = amd_bytealign (w0[2], w0[3], offset); w5[1] = amd_bytealign (w0[1], w0[2], offset); w5[0] = amd_bytealign (w0[0], w0[1], offset); w4[3] = amd_bytealign ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: w7[3] = amd_bytealign (w2[2], w2[3], offset); w7[2] = amd_bytealign (w2[1], w2[2], offset); w7[1] = amd_bytealign (w2[0], w2[1], offset); w7[0] = amd_bytealign (w1[3], w2[0], offset); w6[3] = amd_bytealign (w1[2], w1[3], offset); w6[2] = amd_bytealign (w1[1], w1[2], offset); w6[1] = amd_bytealign (w1[0], w1[1], offset); w6[0] = amd_bytealign (w0[3], w1[0], offset); w5[3] = amd_bytealign (w0[2], w0[3], offset); w5[2] = amd_bytealign (w0[1], w0[2], offset); w5[1] = amd_bytealign (w0[0], w0[1], offset); w5[0] = amd_bytealign ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; 
w0[1] = 0; w0[0] = 0; break; case 21: w7[3] = amd_bytealign (w2[1], w2[2], offset); w7[2] = amd_bytealign (w2[0], w2[1], offset); w7[1] = amd_bytealign (w1[3], w2[0], offset); w7[0] = amd_bytealign (w1[2], w1[3], offset); w6[3] = amd_bytealign (w1[1], w1[2], offset); w6[2] = amd_bytealign (w1[0], w1[1], offset); w6[1] = amd_bytealign (w0[3], w1[0], offset); w6[0] = amd_bytealign (w0[2], w0[3], offset); w5[3] = amd_bytealign (w0[1], w0[2], offset); w5[2] = amd_bytealign (w0[0], w0[1], offset); w5[1] = amd_bytealign ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: w7[3] = amd_bytealign (w2[0], w2[1], offset); w7[2] = amd_bytealign (w1[3], w2[0], offset); w7[1] = amd_bytealign (w1[2], w1[3], offset); w7[0] = amd_bytealign (w1[1], w1[2], offset); w6[3] = amd_bytealign (w1[0], w1[1], offset); w6[2] = amd_bytealign (w0[3], w1[0], offset); w6[1] = amd_bytealign (w0[2], w0[3], offset); w6[0] = amd_bytealign (w0[1], w0[2], offset); w5[3] = amd_bytealign (w0[0], w0[1], offset); w5[2] = amd_bytealign ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: w7[3] = amd_bytealign (w1[3], w2[0], offset); w7[2] = amd_bytealign (w1[2], w1[3], offset); w7[1] = amd_bytealign (w1[1], w1[2], offset); w7[0] = amd_bytealign (w1[0], w1[1], offset); w6[3] = amd_bytealign (w0[3], w1[0], offset); w6[2] = amd_bytealign (w0[2], w0[3], offset); w6[1] = amd_bytealign (w0[1], w0[2], offset); w6[0] = amd_bytealign (w0[0], w0[1], offset); w5[3] = amd_bytealign ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 
0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: w7[3] = amd_bytealign (w1[2], w1[3], offset); w7[2] = amd_bytealign (w1[1], w1[2], offset); w7[1] = amd_bytealign (w1[0], w1[1], offset); w7[0] = amd_bytealign (w0[3], w1[0], offset); w6[3] = amd_bytealign (w0[2], w0[3], offset); w6[2] = amd_bytealign (w0[1], w0[2], offset); w6[1] = amd_bytealign (w0[0], w0[1], offset); w6[0] = amd_bytealign ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: w7[3] = amd_bytealign (w1[1], w1[2], offset); w7[2] = amd_bytealign (w1[0], w1[1], offset); w7[1] = amd_bytealign (w0[3], w1[0], offset); w7[0] = amd_bytealign (w0[2], w0[3], offset); w6[3] = amd_bytealign (w0[1], w0[2], offset); w6[2] = amd_bytealign (w0[0], w0[1], offset); w6[1] = amd_bytealign ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: w7[3] = amd_bytealign (w1[0], w1[1], offset); w7[2] = amd_bytealign (w0[3], w1[0], offset); w7[1] = amd_bytealign (w0[2], w0[3], offset); w7[0] = amd_bytealign (w0[1], w0[2], offset); w6[3] = amd_bytealign (w0[0], w0[1], offset); w6[2] = amd_bytealign ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; 
break; case 27: w7[3] = amd_bytealign (w0[3], w1[0], offset); w7[2] = amd_bytealign (w0[2], w0[3], offset); w7[1] = amd_bytealign (w0[1], w0[2], offset); w7[0] = amd_bytealign (w0[0], w0[1], offset); w6[3] = amd_bytealign ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: w7[3] = amd_bytealign (w0[2], w0[3], offset); w7[2] = amd_bytealign (w0[1], w0[2], offset); w7[1] = amd_bytealign (w0[0], w0[1], offset); w7[0] = amd_bytealign ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: w7[3] = amd_bytealign (w0[1], w0[2], offset); w7[2] = amd_bytealign (w0[0], w0[1], offset); w7[1] = amd_bytealign ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: w7[3] = amd_bytealign (w0[0], w0[1], offset); w7[2] = amd_bytealign ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: w7[3] = amd_bytealign ( 0, w0[0], 
offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: w7[3] = __byte_perm (w7[3], w7[2], selector); w7[2] = __byte_perm (w7[2], w7[1], selector); w7[1] = __byte_perm (w7[1], w7[0], selector); w7[0] = __byte_perm (w7[0], w6[3], selector); w6[3] = __byte_perm (w6[3], w6[2], selector); w6[2] = __byte_perm (w6[2], w6[1], selector); w6[1] = __byte_perm (w6[1], w6[0], selector); w6[0] = __byte_perm (w6[0], w5[3], selector); w5[3] = __byte_perm (w5[3], w5[2], selector); w5[2] = __byte_perm (w5[2], w5[1], selector); w5[1] = __byte_perm (w5[1], w5[0], selector); w5[0] = __byte_perm (w5[0], w4[3], selector); w4[3] = __byte_perm (w4[3], w4[2], selector); w4[2] = __byte_perm (w4[2], w4[1], selector); w4[1] = __byte_perm (w4[1], w4[0], selector); w4[0] = __byte_perm (w4[0], w3[3], selector); w3[3] = __byte_perm (w3[3], w3[2], selector); w3[2] = __byte_perm (w3[2], w3[1], selector); w3[1] = __byte_perm (w3[1], w3[0], selector); w3[0] = __byte_perm (w3[0], w2[3], selector); w2[3] = __byte_perm (w2[3], w2[2], selector); w2[2] = __byte_perm (w2[2], w2[1], selector); w2[1] = __byte_perm (w2[1], w2[0], selector); w2[0] = __byte_perm (w2[0], w1[3], selector); w1[3] = __byte_perm (w1[3], w1[2], selector); w1[2] = __byte_perm (w1[2], w1[1], selector); w1[1] = __byte_perm (w1[1], w1[0], selector); w1[0] = __byte_perm (w1[0], w0[3], selector); w0[3] = __byte_perm (w0[3], w0[2], selector); w0[2] = __byte_perm (w0[2], 
w0[1], selector); w0[1] = __byte_perm (w0[1], w0[0], selector); w0[0] = __byte_perm (w0[0], 0, selector); break; case 1: w7[3] = __byte_perm (w7[2], w7[1], selector); w7[2] = __byte_perm (w7[1], w7[0], selector); w7[1] = __byte_perm (w7[0], w6[3], selector); w7[0] = __byte_perm (w6[3], w6[2], selector); w6[3] = __byte_perm (w6[2], w6[1], selector); w6[2] = __byte_perm (w6[1], w6[0], selector); w6[1] = __byte_perm (w6[0], w5[3], selector); w6[0] = __byte_perm (w5[3], w5[2], selector); w5[3] = __byte_perm (w5[2], w5[1], selector); w5[2] = __byte_perm (w5[1], w5[0], selector); w5[1] = __byte_perm (w5[0], w4[3], selector); w5[0] = __byte_perm (w4[3], w4[2], selector); w4[3] = __byte_perm (w4[2], w4[1], selector); w4[2] = __byte_perm (w4[1], w4[0], selector); w4[1] = __byte_perm (w4[0], w3[3], selector); w4[0] = __byte_perm (w3[3], w3[2], selector); w3[3] = __byte_perm (w3[2], w3[1], selector); w3[2] = __byte_perm (w3[1], w3[0], selector); w3[1] = __byte_perm (w3[0], w2[3], selector); w3[0] = __byte_perm (w2[3], w2[2], selector); w2[3] = __byte_perm (w2[2], w2[1], selector); w2[2] = __byte_perm (w2[1], w2[0], selector); w2[1] = __byte_perm (w2[0], w1[3], selector); w2[0] = __byte_perm (w1[3], w1[2], selector); w1[3] = __byte_perm (w1[2], w1[1], selector); w1[2] = __byte_perm (w1[1], w1[0], selector); w1[1] = __byte_perm (w1[0], w0[3], selector); w1[0] = __byte_perm (w0[3], w0[2], selector); w0[3] = __byte_perm (w0[2], w0[1], selector); w0[2] = __byte_perm (w0[1], w0[0], selector); w0[1] = __byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: w7[3] = __byte_perm (w7[1], w7[0], selector); w7[2] = __byte_perm (w7[0], w6[3], selector); w7[1] = __byte_perm (w6[3], w6[2], selector); w7[0] = __byte_perm (w6[2], w6[1], selector); w6[3] = __byte_perm (w6[1], w6[0], selector); w6[2] = __byte_perm (w6[0], w5[3], selector); w6[1] = __byte_perm (w5[3], w5[2], selector); w6[0] = __byte_perm (w5[2], w5[1], selector); w5[3] = __byte_perm (w5[1], w5[0], selector); w5[2] = 
__byte_perm (w5[0], w4[3], selector); w5[1] = __byte_perm (w4[3], w4[2], selector); w5[0] = __byte_perm (w4[2], w4[1], selector); w4[3] = __byte_perm (w4[1], w4[0], selector); w4[2] = __byte_perm (w4[0], w3[3], selector); w4[1] = __byte_perm (w3[3], w3[2], selector); w4[0] = __byte_perm (w3[2], w3[1], selector); w3[3] = __byte_perm (w3[1], w3[0], selector); w3[2] = __byte_perm (w3[0], w2[3], selector); w3[1] = __byte_perm (w2[3], w2[2], selector); w3[0] = __byte_perm (w2[2], w2[1], selector); w2[3] = __byte_perm (w2[1], w2[0], selector); w2[2] = __byte_perm (w2[0], w1[3], selector); w2[1] = __byte_perm (w1[3], w1[2], selector); w2[0] = __byte_perm (w1[2], w1[1], selector); w1[3] = __byte_perm (w1[1], w1[0], selector); w1[2] = __byte_perm (w1[0], w0[3], selector); w1[1] = __byte_perm (w0[3], w0[2], selector); w1[0] = __byte_perm (w0[2], w0[1], selector); w0[3] = __byte_perm (w0[1], w0[0], selector); w0[2] = __byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: w7[3] = __byte_perm (w7[0], w6[3], selector); w7[2] = __byte_perm (w6[3], w6[2], selector); w7[1] = __byte_perm (w6[2], w6[1], selector); w7[0] = __byte_perm (w6[1], w6[0], selector); w6[3] = __byte_perm (w6[0], w5[3], selector); w6[2] = __byte_perm (w5[3], w5[2], selector); w6[1] = __byte_perm (w5[2], w5[1], selector); w6[0] = __byte_perm (w5[1], w5[0], selector); w5[3] = __byte_perm (w5[0], w4[3], selector); w5[2] = __byte_perm (w4[3], w4[2], selector); w5[1] = __byte_perm (w4[2], w4[1], selector); w5[0] = __byte_perm (w4[1], w4[0], selector); w4[3] = __byte_perm (w4[0], w3[3], selector); w4[2] = __byte_perm (w3[3], w3[2], selector); w4[1] = __byte_perm (w3[2], w3[1], selector); w4[0] = __byte_perm (w3[1], w3[0], selector); w3[3] = __byte_perm (w3[0], w2[3], selector); w3[2] = __byte_perm (w2[3], w2[2], selector); w3[1] = __byte_perm (w2[2], w2[1], selector); w3[0] = __byte_perm (w2[1], w2[0], selector); w2[3] = __byte_perm (w2[0], w1[3], selector); w2[2] = __byte_perm (w1[3], w1[2], 
selector); w2[1] = __byte_perm (w1[2], w1[1], selector); w2[0] = __byte_perm (w1[1], w1[0], selector); w1[3] = __byte_perm (w1[0], w0[3], selector); w1[2] = __byte_perm (w0[3], w0[2], selector); w1[1] = __byte_perm (w0[2], w0[1], selector); w1[0] = __byte_perm (w0[1], w0[0], selector); w0[3] = __byte_perm (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w7[3] = __byte_perm (w6[3], w6[2], selector); w7[2] = __byte_perm (w6[2], w6[1], selector); w7[1] = __byte_perm (w6[1], w6[0], selector); w7[0] = __byte_perm (w6[0], w5[3], selector); w6[3] = __byte_perm (w5[3], w5[2], selector); w6[2] = __byte_perm (w5[2], w5[1], selector); w6[1] = __byte_perm (w5[1], w5[0], selector); w6[0] = __byte_perm (w5[0], w4[3], selector); w5[3] = __byte_perm (w4[3], w4[2], selector); w5[2] = __byte_perm (w4[2], w4[1], selector); w5[1] = __byte_perm (w4[1], w4[0], selector); w5[0] = __byte_perm (w4[0], w3[3], selector); w4[3] = __byte_perm (w3[3], w3[2], selector); w4[2] = __byte_perm (w3[2], w3[1], selector); w4[1] = __byte_perm (w3[1], w3[0], selector); w4[0] = __byte_perm (w3[0], w2[3], selector); w3[3] = __byte_perm (w2[3], w2[2], selector); w3[2] = __byte_perm (w2[2], w2[1], selector); w3[1] = __byte_perm (w2[1], w2[0], selector); w3[0] = __byte_perm (w2[0], w1[3], selector); w2[3] = __byte_perm (w1[3], w1[2], selector); w2[2] = __byte_perm (w1[2], w1[1], selector); w2[1] = __byte_perm (w1[1], w1[0], selector); w2[0] = __byte_perm (w1[0], w0[3], selector); w1[3] = __byte_perm (w0[3], w0[2], selector); w1[2] = __byte_perm (w0[2], w0[1], selector); w1[1] = __byte_perm (w0[1], w0[0], selector); w1[0] = __byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = __byte_perm (w6[2], w6[1], selector); w7[2] = __byte_perm (w6[1], w6[0], selector); w7[1] = __byte_perm (w6[0], w5[3], selector); w7[0] = __byte_perm (w5[3], w5[2], selector); w6[3] = __byte_perm (w5[2], w5[1], selector); w6[2] = __byte_perm (w5[1], w5[0], selector); 
w6[1] = __byte_perm (w5[0], w4[3], selector); w6[0] = __byte_perm (w4[3], w4[2], selector); w5[3] = __byte_perm (w4[2], w4[1], selector); w5[2] = __byte_perm (w4[1], w4[0], selector); w5[1] = __byte_perm (w4[0], w3[3], selector); w5[0] = __byte_perm (w3[3], w3[2], selector); w4[3] = __byte_perm (w3[2], w3[1], selector); w4[2] = __byte_perm (w3[1], w3[0], selector); w4[1] = __byte_perm (w3[0], w2[3], selector); w4[0] = __byte_perm (w2[3], w2[2], selector); w3[3] = __byte_perm (w2[2], w2[1], selector); w3[2] = __byte_perm (w2[1], w2[0], selector); w3[1] = __byte_perm (w2[0], w1[3], selector); w3[0] = __byte_perm (w1[3], w1[2], selector); w2[3] = __byte_perm (w1[2], w1[1], selector); w2[2] = __byte_perm (w1[1], w1[0], selector); w2[1] = __byte_perm (w1[0], w0[3], selector); w2[0] = __byte_perm (w0[3], w0[2], selector); w1[3] = __byte_perm (w0[2], w0[1], selector); w1[2] = __byte_perm (w0[1], w0[0], selector); w1[1] = __byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = __byte_perm (w6[1], w6[0], selector); w7[2] = __byte_perm (w6[0], w5[3], selector); w7[1] = __byte_perm (w5[3], w5[2], selector); w7[0] = __byte_perm (w5[2], w5[1], selector); w6[3] = __byte_perm (w5[1], w5[0], selector); w6[2] = __byte_perm (w5[0], w4[3], selector); w6[1] = __byte_perm (w4[3], w4[2], selector); w6[0] = __byte_perm (w4[2], w4[1], selector); w5[3] = __byte_perm (w4[1], w4[0], selector); w5[2] = __byte_perm (w4[0], w3[3], selector); w5[1] = __byte_perm (w3[3], w3[2], selector); w5[0] = __byte_perm (w3[2], w3[1], selector); w4[3] = __byte_perm (w3[1], w3[0], selector); w4[2] = __byte_perm (w3[0], w2[3], selector); w4[1] = __byte_perm (w2[3], w2[2], selector); w4[0] = __byte_perm (w2[2], w2[1], selector); w3[3] = __byte_perm (w2[1], w2[0], selector); w3[2] = __byte_perm (w2[0], w1[3], selector); w3[1] = __byte_perm (w1[3], w1[2], selector); w3[0] = __byte_perm (w1[2], w1[1], selector); w2[3] = __byte_perm (w1[1], w1[0], selector); 
w2[2] = __byte_perm (w1[0], w0[3], selector); w2[1] = __byte_perm (w0[3], w0[2], selector); w2[0] = __byte_perm (w0[2], w0[1], selector); w1[3] = __byte_perm (w0[1], w0[0], selector); w1[2] = __byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = __byte_perm (w6[0], w5[3], selector); w7[2] = __byte_perm (w5[3], w5[2], selector); w7[1] = __byte_perm (w5[2], w5[1], selector); w7[0] = __byte_perm (w5[1], w5[0], selector); w6[3] = __byte_perm (w5[0], w4[3], selector); w6[2] = __byte_perm (w4[3], w4[2], selector); w6[1] = __byte_perm (w4[2], w4[1], selector); w6[0] = __byte_perm (w4[1], w4[0], selector); w5[3] = __byte_perm (w4[0], w3[3], selector); w5[2] = __byte_perm (w3[3], w3[2], selector); w5[1] = __byte_perm (w3[2], w3[1], selector); w5[0] = __byte_perm (w3[1], w3[0], selector); w4[3] = __byte_perm (w3[0], w2[3], selector); w4[2] = __byte_perm (w2[3], w2[2], selector); w4[1] = __byte_perm (w2[2], w2[1], selector); w4[0] = __byte_perm (w2[1], w2[0], selector); w3[3] = __byte_perm (w2[0], w1[3], selector); w3[2] = __byte_perm (w1[3], w1[2], selector); w3[1] = __byte_perm (w1[2], w1[1], selector); w3[0] = __byte_perm (w1[1], w1[0], selector); w2[3] = __byte_perm (w1[0], w0[3], selector); w2[2] = __byte_perm (w0[3], w0[2], selector); w2[1] = __byte_perm (w0[2], w0[1], selector); w2[0] = __byte_perm (w0[1], w0[0], selector); w1[3] = __byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = __byte_perm (w5[3], w5[2], selector); w7[2] = __byte_perm (w5[2], w5[1], selector); w7[1] = __byte_perm (w5[1], w5[0], selector); w7[0] = __byte_perm (w5[0], w4[3], selector); w6[3] = __byte_perm (w4[3], w4[2], selector); w6[2] = __byte_perm (w4[2], w4[1], selector); w6[1] = __byte_perm (w4[1], w4[0], selector); w6[0] = __byte_perm (w4[0], w3[3], selector); w5[3] = __byte_perm (w3[3], w3[2], selector); w5[2] = __byte_perm (w3[2], w3[1], 
selector); w5[1] = __byte_perm (w3[1], w3[0], selector); w5[0] = __byte_perm (w3[0], w2[3], selector); w4[3] = __byte_perm (w2[3], w2[2], selector); w4[2] = __byte_perm (w2[2], w2[1], selector); w4[1] = __byte_perm (w2[1], w2[0], selector); w4[0] = __byte_perm (w2[0], w1[3], selector); w3[3] = __byte_perm (w1[3], w1[2], selector); w3[2] = __byte_perm (w1[2], w1[1], selector); w3[1] = __byte_perm (w1[1], w1[0], selector); w3[0] = __byte_perm (w1[0], w0[3], selector); w2[3] = __byte_perm (w0[3], w0[2], selector); w2[2] = __byte_perm (w0[2], w0[1], selector); w2[1] = __byte_perm (w0[1], w0[0], selector); w2[0] = __byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = __byte_perm (w5[2], w5[1], selector); w7[2] = __byte_perm (w5[1], w5[0], selector); w7[1] = __byte_perm (w5[0], w4[3], selector); w7[0] = __byte_perm (w4[3], w4[2], selector); w6[3] = __byte_perm (w4[2], w4[1], selector); w6[2] = __byte_perm (w4[1], w4[0], selector); w6[1] = __byte_perm (w4[0], w3[3], selector); w6[0] = __byte_perm (w3[3], w3[2], selector); w5[3] = __byte_perm (w3[2], w3[1], selector); w5[2] = __byte_perm (w3[1], w3[0], selector); w5[1] = __byte_perm (w3[0], w2[3], selector); w5[0] = __byte_perm (w2[3], w2[2], selector); w4[3] = __byte_perm (w2[2], w2[1], selector); w4[2] = __byte_perm (w2[1], w2[0], selector); w4[1] = __byte_perm (w2[0], w1[3], selector); w4[0] = __byte_perm (w1[3], w1[2], selector); w3[3] = __byte_perm (w1[2], w1[1], selector); w3[2] = __byte_perm (w1[1], w1[0], selector); w3[1] = __byte_perm (w1[0], w0[3], selector); w3[0] = __byte_perm (w0[3], w0[2], selector); w2[3] = __byte_perm (w0[2], w0[1], selector); w2[2] = __byte_perm (w0[1], w0[0], selector); w2[1] = __byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = __byte_perm (w5[1], w5[0], selector); w7[2] = __byte_perm (w5[0], 
w4[3], selector); w7[1] = __byte_perm (w4[3], w4[2], selector); w7[0] = __byte_perm (w4[2], w4[1], selector); w6[3] = __byte_perm (w4[1], w4[0], selector); w6[2] = __byte_perm (w4[0], w3[3], selector); w6[1] = __byte_perm (w3[3], w3[2], selector); w6[0] = __byte_perm (w3[2], w3[1], selector); w5[3] = __byte_perm (w3[1], w3[0], selector); w5[2] = __byte_perm (w3[0], w2[3], selector); w5[1] = __byte_perm (w2[3], w2[2], selector); w5[0] = __byte_perm (w2[2], w2[1], selector); w4[3] = __byte_perm (w2[1], w2[0], selector); w4[2] = __byte_perm (w2[0], w1[3], selector); w4[1] = __byte_perm (w1[3], w1[2], selector); w4[0] = __byte_perm (w1[2], w1[1], selector); w3[3] = __byte_perm (w1[1], w1[0], selector); w3[2] = __byte_perm (w1[0], w0[3], selector); w3[1] = __byte_perm (w0[3], w0[2], selector); w3[0] = __byte_perm (w0[2], w0[1], selector); w2[3] = __byte_perm (w0[1], w0[0], selector); w2[2] = __byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = __byte_perm (w5[0], w4[3], selector); w7[2] = __byte_perm (w4[3], w4[2], selector); w7[1] = __byte_perm (w4[2], w4[1], selector); w7[0] = __byte_perm (w4[1], w4[0], selector); w6[3] = __byte_perm (w4[0], w3[3], selector); w6[2] = __byte_perm (w3[3], w3[2], selector); w6[1] = __byte_perm (w3[2], w3[1], selector); w6[0] = __byte_perm (w3[1], w3[0], selector); w5[3] = __byte_perm (w3[0], w2[3], selector); w5[2] = __byte_perm (w2[3], w2[2], selector); w5[1] = __byte_perm (w2[2], w2[1], selector); w5[0] = __byte_perm (w2[1], w2[0], selector); w4[3] = __byte_perm (w2[0], w1[3], selector); w4[2] = __byte_perm (w1[3], w1[2], selector); w4[1] = __byte_perm (w1[2], w1[1], selector); w4[0] = __byte_perm (w1[1], w1[0], selector); w3[3] = __byte_perm (w1[0], w0[3], selector); w3[2] = __byte_perm (w0[3], w0[2], selector); w3[1] = __byte_perm (w0[2], w0[1], selector); w3[0] = __byte_perm (w0[1], w0[0], selector); w2[3] = __byte_perm 
(w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = __byte_perm (w4[3], w4[2], selector); w7[2] = __byte_perm (w4[2], w4[1], selector); w7[1] = __byte_perm (w4[1], w4[0], selector); w7[0] = __byte_perm (w4[0], w3[3], selector); w6[3] = __byte_perm (w3[3], w3[2], selector); w6[2] = __byte_perm (w3[2], w3[1], selector); w6[1] = __byte_perm (w3[1], w3[0], selector); w6[0] = __byte_perm (w3[0], w2[3], selector); w5[3] = __byte_perm (w2[3], w2[2], selector); w5[2] = __byte_perm (w2[2], w2[1], selector); w5[1] = __byte_perm (w2[1], w2[0], selector); w5[0] = __byte_perm (w2[0], w1[3], selector); w4[3] = __byte_perm (w1[3], w1[2], selector); w4[2] = __byte_perm (w1[2], w1[1], selector); w4[1] = __byte_perm (w1[1], w1[0], selector); w4[0] = __byte_perm (w1[0], w0[3], selector); w3[3] = __byte_perm (w0[3], w0[2], selector); w3[2] = __byte_perm (w0[2], w0[1], selector); w3[1] = __byte_perm (w0[1], w0[0], selector); w3[0] = __byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = __byte_perm (w4[2], w4[1], selector); w7[2] = __byte_perm (w4[1], w4[0], selector); w7[1] = __byte_perm (w4[0], w3[3], selector); w7[0] = __byte_perm (w3[3], w3[2], selector); w6[3] = __byte_perm (w3[2], w3[1], selector); w6[2] = __byte_perm (w3[1], w3[0], selector); w6[1] = __byte_perm (w3[0], w2[3], selector); w6[0] = __byte_perm (w2[3], w2[2], selector); w5[3] = __byte_perm (w2[2], w2[1], selector); w5[2] = __byte_perm (w2[1], w2[0], selector); w5[1] = __byte_perm (w2[0], w1[3], selector); w5[0] = __byte_perm (w1[3], w1[2], selector); w4[3] = __byte_perm (w1[2], w1[1], selector); w4[2] = __byte_perm (w1[1], w1[0], selector); w4[1] = __byte_perm (w1[0], w0[3], selector); w4[0] = __byte_perm (w0[3], w0[2], selector); w3[3] = __byte_perm (w0[2], w0[1], 
selector); w3[2] = __byte_perm (w0[1], w0[0], selector); w3[1] = __byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = __byte_perm (w4[1], w4[0], selector); w7[2] = __byte_perm (w4[0], w3[3], selector); w7[1] = __byte_perm (w3[3], w3[2], selector); w7[0] = __byte_perm (w3[2], w3[1], selector); w6[3] = __byte_perm (w3[1], w3[0], selector); w6[2] = __byte_perm (w3[0], w2[3], selector); w6[1] = __byte_perm (w2[3], w2[2], selector); w6[0] = __byte_perm (w2[2], w2[1], selector); w5[3] = __byte_perm (w2[1], w2[0], selector); w5[2] = __byte_perm (w2[0], w1[3], selector); w5[1] = __byte_perm (w1[3], w1[2], selector); w5[0] = __byte_perm (w1[2], w1[1], selector); w4[3] = __byte_perm (w1[1], w1[0], selector); w4[2] = __byte_perm (w1[0], w0[3], selector); w4[1] = __byte_perm (w0[3], w0[2], selector); w4[0] = __byte_perm (w0[2], w0[1], selector); w3[3] = __byte_perm (w0[1], w0[0], selector); w3[2] = __byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = __byte_perm (w4[0], w3[3], selector); w7[2] = __byte_perm (w3[3], w3[2], selector); w7[1] = __byte_perm (w3[2], w3[1], selector); w7[0] = __byte_perm (w3[1], w3[0], selector); w6[3] = __byte_perm (w3[0], w2[3], selector); w6[2] = __byte_perm (w2[3], w2[2], selector); w6[1] = __byte_perm (w2[2], w2[1], selector); w6[0] = __byte_perm (w2[1], w2[0], selector); w5[3] = __byte_perm (w2[0], w1[3], selector); w5[2] = __byte_perm (w1[3], w1[2], selector); w5[1] = __byte_perm (w1[2], w1[1], selector); w5[0] = __byte_perm (w1[1], w1[0], selector); w4[3] = __byte_perm (w1[0], w0[3], selector); w4[2] = __byte_perm (w0[3], w0[2], selector); w4[1] = __byte_perm (w0[2], w0[1], selector); w4[0] = __byte_perm (w0[1], w0[0], selector); w3[3] = 
__byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: w7[3] = __byte_perm (w3[3], w3[2], selector); w7[2] = __byte_perm (w3[2], w3[1], selector); w7[1] = __byte_perm (w3[1], w3[0], selector); w7[0] = __byte_perm (w3[0], w2[3], selector); w6[3] = __byte_perm (w2[3], w2[2], selector); w6[2] = __byte_perm (w2[2], w2[1], selector); w6[1] = __byte_perm (w2[1], w2[0], selector); w6[0] = __byte_perm (w2[0], w1[3], selector); w5[3] = __byte_perm (w1[3], w1[2], selector); w5[2] = __byte_perm (w1[2], w1[1], selector); w5[1] = __byte_perm (w1[1], w1[0], selector); w5[0] = __byte_perm (w1[0], w0[3], selector); w4[3] = __byte_perm (w0[3], w0[2], selector); w4[2] = __byte_perm (w0[2], w0[1], selector); w4[1] = __byte_perm (w0[1], w0[0], selector); w4[0] = __byte_perm (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: w7[3] = __byte_perm (w3[2], w3[1], selector); w7[2] = __byte_perm (w3[1], w3[0], selector); w7[1] = __byte_perm (w3[0], w2[3], selector); w7[0] = __byte_perm (w2[3], w2[2], selector); w6[3] = __byte_perm (w2[2], w2[1], selector); w6[2] = __byte_perm (w2[1], w2[0], selector); w6[1] = __byte_perm (w2[0], w1[3], selector); w6[0] = __byte_perm (w1[3], w1[2], selector); w5[3] = __byte_perm (w1[2], w1[1], selector); w5[2] = __byte_perm (w1[1], w1[0], selector); w5[1] = __byte_perm (w1[0], w0[3], selector); w5[0] = __byte_perm (w0[3], w0[2], selector); w4[3] = __byte_perm (w0[2], w0[1], selector); w4[2] = __byte_perm (w0[1], w0[0], selector); w4[1] = __byte_perm (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 
0; w0[0] = 0; break; case 18: w7[3] = __byte_perm (w3[1], w3[0], selector); w7[2] = __byte_perm (w3[0], w2[3], selector); w7[1] = __byte_perm (w2[3], w2[2], selector); w7[0] = __byte_perm (w2[2], w2[1], selector); w6[3] = __byte_perm (w2[1], w2[0], selector); w6[2] = __byte_perm (w2[0], w1[3], selector); w6[1] = __byte_perm (w1[3], w1[2], selector); w6[0] = __byte_perm (w1[2], w1[1], selector); w5[3] = __byte_perm (w1[1], w1[0], selector); w5[2] = __byte_perm (w1[0], w0[3], selector); w5[1] = __byte_perm (w0[3], w0[2], selector); w5[0] = __byte_perm (w0[2], w0[1], selector); w4[3] = __byte_perm (w0[1], w0[0], selector); w4[2] = __byte_perm (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: w7[3] = __byte_perm (w3[0], w2[3], selector); w7[2] = __byte_perm (w2[3], w2[2], selector); w7[1] = __byte_perm (w2[2], w2[1], selector); w7[0] = __byte_perm (w2[1], w2[0], selector); w6[3] = __byte_perm (w2[0], w1[3], selector); w6[2] = __byte_perm (w1[3], w1[2], selector); w6[1] = __byte_perm (w1[2], w1[1], selector); w6[0] = __byte_perm (w1[1], w1[0], selector); w5[3] = __byte_perm (w1[0], w0[3], selector); w5[2] = __byte_perm (w0[3], w0[2], selector); w5[1] = __byte_perm (w0[2], w0[1], selector); w5[0] = __byte_perm (w0[1], w0[0], selector); w4[3] = __byte_perm (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: w7[3] = __byte_perm (w2[3], w2[2], selector); w7[2] = __byte_perm (w2[2], w2[1], selector); w7[1] = __byte_perm (w2[1], w2[0], selector); w7[0] = __byte_perm (w2[0], w1[3], selector); w6[3] = __byte_perm (w1[3], w1[2], selector); w6[2] = __byte_perm (w1[2], w1[1], selector); w6[1] = __byte_perm 
(w1[1], w1[0], selector); w6[0] = __byte_perm (w1[0], w0[3], selector); w5[3] = __byte_perm (w0[3], w0[2], selector); w5[2] = __byte_perm (w0[2], w0[1], selector); w5[1] = __byte_perm (w0[1], w0[0], selector); w5[0] = __byte_perm (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: w7[3] = __byte_perm (w2[2], w2[1], selector); w7[2] = __byte_perm (w2[1], w2[0], selector); w7[1] = __byte_perm (w2[0], w1[3], selector); w7[0] = __byte_perm (w1[3], w1[2], selector); w6[3] = __byte_perm (w1[2], w1[1], selector); w6[2] = __byte_perm (w1[1], w1[0], selector); w6[1] = __byte_perm (w1[0], w0[3], selector); w6[0] = __byte_perm (w0[3], w0[2], selector); w5[3] = __byte_perm (w0[2], w0[1], selector); w5[2] = __byte_perm (w0[1], w0[0], selector); w5[1] = __byte_perm (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: w7[3] = __byte_perm (w2[1], w2[0], selector); w7[2] = __byte_perm (w2[0], w1[3], selector); w7[1] = __byte_perm (w1[3], w1[2], selector); w7[0] = __byte_perm (w1[2], w1[1], selector); w6[3] = __byte_perm (w1[1], w1[0], selector); w6[2] = __byte_perm (w1[0], w0[3], selector); w6[1] = __byte_perm (w0[3], w0[2], selector); w6[0] = __byte_perm (w0[2], w0[1], selector); w5[3] = __byte_perm (w0[1], w0[0], selector); w5[2] = __byte_perm (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: w7[3] = __byte_perm (w2[0], w1[3], selector); 
w7[2] = __byte_perm (w1[3], w1[2], selector); w7[1] = __byte_perm (w1[2], w1[1], selector); w7[0] = __byte_perm (w1[1], w1[0], selector); w6[3] = __byte_perm (w1[0], w0[3], selector); w6[2] = __byte_perm (w0[3], w0[2], selector); w6[1] = __byte_perm (w0[2], w0[1], selector); w6[0] = __byte_perm (w0[1], w0[0], selector); w5[3] = __byte_perm (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: w7[3] = __byte_perm (w1[3], w1[2], selector); w7[2] = __byte_perm (w1[2], w1[1], selector); w7[1] = __byte_perm (w1[1], w1[0], selector); w7[0] = __byte_perm (w1[0], w0[3], selector); w6[3] = __byte_perm (w0[3], w0[2], selector); w6[2] = __byte_perm (w0[2], w0[1], selector); w6[1] = __byte_perm (w0[1], w0[0], selector); w6[0] = __byte_perm (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: w7[3] = __byte_perm (w1[2], w1[1], selector); w7[2] = __byte_perm (w1[1], w1[0], selector); w7[1] = __byte_perm (w1[0], w0[3], selector); w7[0] = __byte_perm (w0[3], w0[2], selector); w6[3] = __byte_perm (w0[2], w0[1], selector); w6[2] = __byte_perm (w0[1], w0[0], selector); w6[1] = __byte_perm (w0[0], 0, selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: w7[3] = __byte_perm (w1[1], w1[0], selector); w7[2] = __byte_perm (w1[0], w0[3], selector); w7[1] = __byte_perm 
(w0[3], w0[2], selector); w7[0] = __byte_perm (w0[2], w0[1], selector); w6[3] = __byte_perm (w0[1], w0[0], selector); w6[2] = __byte_perm (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: w7[3] = __byte_perm (w1[0], w0[3], selector); w7[2] = __byte_perm (w0[3], w0[2], selector); w7[1] = __byte_perm (w0[2], w0[1], selector); w7[0] = __byte_perm (w0[1], w0[0], selector); w6[3] = __byte_perm (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: w7[3] = __byte_perm (w0[3], w0[2], selector); w7[2] = __byte_perm (w0[2], w0[1], selector); w7[1] = __byte_perm (w0[1], w0[0], selector); w7[0] = __byte_perm (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: w7[3] = __byte_perm (w0[2], w0[1], selector); w7[2] = __byte_perm (w0[1], w0[0], selector); w7[1] = __byte_perm (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: w7[3] = __byte_perm 
(w0[1], w0[0], selector); w7[2] = __byte_perm (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: w7[3] = __byte_perm (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_8x4_carry_be (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], u32x c0[4], u32x c1[4], u32x c2[4], u32x c3[4], u32x c4[4], u32x c5[4], u32x c6[4], u32x c7[4], const u32 offset) { const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: c0[0] = amd_bytealign (w7[3], 0, offset); w7[3] = amd_bytealign (w7[2], w7[3], offset); w7[2] = amd_bytealign (w7[1], w7[2], offset); w7[1] = amd_bytealign (w7[0], w7[1], offset); w7[0] = amd_bytealign (w6[3], w7[0], offset); w6[3] = amd_bytealign (w6[2], w6[3], offset); w6[2] = amd_bytealign (w6[1], w6[2], offset); w6[1] = amd_bytealign (w6[0], w6[1], offset); w6[0] = amd_bytealign (w5[3], w6[0], offset); w5[3] = amd_bytealign (w5[2], w5[3], offset); w5[2] = amd_bytealign (w5[1], w5[2], offset); w5[1] = amd_bytealign (w5[0], w5[1], offset); w5[0] = amd_bytealign (w4[3], w5[0], offset); w4[3] = amd_bytealign (w4[2], w4[3], offset); w4[2] = amd_bytealign (w4[1], w4[2], offset); w4[1] = amd_bytealign (w4[0], w4[1], offset); w4[0] = amd_bytealign (w3[3], w4[0], offset); w3[3] = 
amd_bytealign (w3[2], w3[3], offset); w3[2] = amd_bytealign (w3[1], w3[2], offset); w3[1] = amd_bytealign (w3[0], w3[1], offset); w3[0] = amd_bytealign (w2[3], w3[0], offset); w2[3] = amd_bytealign (w2[2], w2[3], offset); w2[2] = amd_bytealign (w2[1], w2[2], offset); w2[1] = amd_bytealign (w2[0], w2[1], offset); w2[0] = amd_bytealign (w1[3], w2[0], offset); w1[3] = amd_bytealign (w1[2], w1[3], offset); w1[2] = amd_bytealign (w1[1], w1[2], offset); w1[1] = amd_bytealign (w1[0], w1[1], offset); w1[0] = amd_bytealign (w0[3], w1[0], offset); w0[3] = amd_bytealign (w0[2], w0[3], offset); w0[2] = amd_bytealign (w0[1], w0[2], offset); w0[1] = amd_bytealign (w0[0], w0[1], offset); w0[0] = amd_bytealign ( 0, w0[0], offset); break; case 1: c0[1] = amd_bytealign (w7[3], 0, offset); c0[0] = amd_bytealign (w7[2], w7[3], offset); w7[3] = amd_bytealign (w7[1], w7[2], offset); w7[2] = amd_bytealign (w7[0], w7[1], offset); w7[1] = amd_bytealign (w6[3], w7[0], offset); w7[0] = amd_bytealign (w6[2], w6[3], offset); w6[3] = amd_bytealign (w6[1], w6[2], offset); w6[2] = amd_bytealign (w6[0], w6[1], offset); w6[1] = amd_bytealign (w5[3], w6[0], offset); w6[0] = amd_bytealign (w5[2], w5[3], offset); w5[3] = amd_bytealign (w5[1], w5[2], offset); w5[2] = amd_bytealign (w5[0], w5[1], offset); w5[1] = amd_bytealign (w4[3], w5[0], offset); w5[0] = amd_bytealign (w4[2], w4[3], offset); w4[3] = amd_bytealign (w4[1], w4[2], offset); w4[2] = amd_bytealign (w4[0], w4[1], offset); w4[1] = amd_bytealign (w3[3], w4[0], offset); w4[0] = amd_bytealign (w3[2], w3[3], offset); w3[3] = amd_bytealign (w3[1], w3[2], offset); w3[2] = amd_bytealign (w3[0], w3[1], offset); w3[1] = amd_bytealign (w2[3], w3[0], offset); w3[0] = amd_bytealign (w2[2], w2[3], offset); w2[3] = amd_bytealign (w2[1], w2[2], offset); w2[2] = amd_bytealign (w2[0], w2[1], offset); w2[1] = amd_bytealign (w1[3], w2[0], offset); w2[0] = amd_bytealign (w1[2], w1[3], offset); w1[3] = amd_bytealign (w1[1], w1[2], offset); w1[2] = amd_bytealign 
(w1[0], w1[1], offset); w1[1] = amd_bytealign (w0[3], w1[0], offset); w1[0] = amd_bytealign (w0[2], w0[3], offset); w0[3] = amd_bytealign (w0[1], w0[2], offset); w0[2] = amd_bytealign (w0[0], w0[1], offset); w0[1] = amd_bytealign ( 0, w0[0], offset); w0[0] = 0; break; case 2: c0[2] = amd_bytealign (w7[3], 0, offset); c0[1] = amd_bytealign (w7[2], w7[3], offset); c0[0] = amd_bytealign (w7[1], w7[2], offset); w7[3] = amd_bytealign (w7[0], w7[1], offset); w7[2] = amd_bytealign (w6[3], w7[0], offset); w7[1] = amd_bytealign (w6[2], w6[3], offset); w7[0] = amd_bytealign (w6[1], w6[2], offset); w6[3] = amd_bytealign (w6[0], w6[1], offset); w6[2] = amd_bytealign (w5[3], w6[0], offset); w6[1] = amd_bytealign (w5[2], w5[3], offset); w6[0] = amd_bytealign (w5[1], w5[2], offset); w5[3] = amd_bytealign (w5[0], w5[1], offset); w5[2] = amd_bytealign (w4[3], w5[0], offset); w5[1] = amd_bytealign (w4[2], w4[3], offset); w5[0] = amd_bytealign (w4[1], w4[2], offset); w4[3] = amd_bytealign (w4[0], w4[1], offset); w4[2] = amd_bytealign (w3[3], w4[0], offset); w4[1] = amd_bytealign (w3[2], w3[3], offset); w4[0] = amd_bytealign (w3[1], w3[2], offset); w3[3] = amd_bytealign (w3[0], w3[1], offset); w3[2] = amd_bytealign (w2[3], w3[0], offset); w3[1] = amd_bytealign (w2[2], w2[3], offset); w3[0] = amd_bytealign (w2[1], w2[2], offset); w2[3] = amd_bytealign (w2[0], w2[1], offset); w2[2] = amd_bytealign (w1[3], w2[0], offset); w2[1] = amd_bytealign (w1[2], w1[3], offset); w2[0] = amd_bytealign (w1[1], w1[2], offset); w1[3] = amd_bytealign (w1[0], w1[1], offset); w1[2] = amd_bytealign (w0[3], w1[0], offset); w1[1] = amd_bytealign (w0[2], w0[3], offset); w1[0] = amd_bytealign (w0[1], w0[2], offset); w0[3] = amd_bytealign (w0[0], w0[1], offset); w0[2] = amd_bytealign ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = amd_bytealign (w7[3], 0, offset); c0[2] = amd_bytealign (w7[2], w7[3], offset); c0[1] = amd_bytealign (w7[1], w7[2], offset); c0[0] = amd_bytealign (w7[0], w7[1], 
offset); w7[3] = amd_bytealign (w6[3], w7[0], offset); w7[2] = amd_bytealign (w6[2], w6[3], offset); w7[1] = amd_bytealign (w6[1], w6[2], offset); w7[0] = amd_bytealign (w6[0], w6[1], offset); w6[3] = amd_bytealign (w5[3], w6[0], offset); w6[2] = amd_bytealign (w5[2], w5[3], offset); w6[1] = amd_bytealign (w5[1], w5[2], offset); w6[0] = amd_bytealign (w5[0], w5[1], offset); w5[3] = amd_bytealign (w4[3], w5[0], offset); w5[2] = amd_bytealign (w4[2], w4[3], offset); w5[1] = amd_bytealign (w4[1], w4[2], offset); w5[0] = amd_bytealign (w4[0], w4[1], offset); w4[3] = amd_bytealign (w3[3], w4[0], offset); w4[2] = amd_bytealign (w3[2], w3[3], offset); w4[1] = amd_bytealign (w3[1], w3[2], offset); w4[0] = amd_bytealign (w3[0], w3[1], offset); w3[3] = amd_bytealign (w2[3], w3[0], offset); w3[2] = amd_bytealign (w2[2], w2[3], offset); w3[1] = amd_bytealign (w2[1], w2[2], offset); w3[0] = amd_bytealign (w2[0], w2[1], offset); w2[3] = amd_bytealign (w1[3], w2[0], offset); w2[2] = amd_bytealign (w1[2], w1[3], offset); w2[1] = amd_bytealign (w1[1], w1[2], offset); w2[0] = amd_bytealign (w1[0], w1[1], offset); w1[3] = amd_bytealign (w0[3], w1[0], offset); w1[2] = amd_bytealign (w0[2], w0[3], offset); w1[1] = amd_bytealign (w0[1], w0[2], offset); w1[0] = amd_bytealign (w0[0], w0[1], offset); w0[3] = amd_bytealign ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = amd_bytealign (w7[3], 0, offset); c0[3] = amd_bytealign (w7[2], w7[3], offset); c0[2] = amd_bytealign (w7[1], w7[2], offset); c0[1] = amd_bytealign (w7[0], w7[1], offset); c0[0] = amd_bytealign (w6[3], w7[0], offset); w7[3] = amd_bytealign (w6[2], w6[3], offset); w7[2] = amd_bytealign (w6[1], w6[2], offset); w7[1] = amd_bytealign (w6[0], w6[1], offset); w7[0] = amd_bytealign (w5[3], w6[0], offset); w6[3] = amd_bytealign (w5[2], w5[3], offset); w6[2] = amd_bytealign (w5[1], w5[2], offset); w6[1] = amd_bytealign (w5[0], w5[1], offset); w6[0] = amd_bytealign (w4[3], w5[0], offset); w5[3] = 
amd_bytealign (w4[2], w4[3], offset); w5[2] = amd_bytealign (w4[1], w4[2], offset); w5[1] = amd_bytealign (w4[0], w4[1], offset); w5[0] = amd_bytealign (w3[3], w4[0], offset); w4[3] = amd_bytealign (w3[2], w3[3], offset); w4[2] = amd_bytealign (w3[1], w3[2], offset); w4[1] = amd_bytealign (w3[0], w3[1], offset); w4[0] = amd_bytealign (w2[3], w3[0], offset); w3[3] = amd_bytealign (w2[2], w2[3], offset); w3[2] = amd_bytealign (w2[1], w2[2], offset); w3[1] = amd_bytealign (w2[0], w2[1], offset); w3[0] = amd_bytealign (w1[3], w2[0], offset); w2[3] = amd_bytealign (w1[2], w1[3], offset); w2[2] = amd_bytealign (w1[1], w1[2], offset); w2[1] = amd_bytealign (w1[0], w1[1], offset); w2[0] = amd_bytealign (w0[3], w1[0], offset); w1[3] = amd_bytealign (w0[2], w0[3], offset); w1[2] = amd_bytealign (w0[1], w0[2], offset); w1[1] = amd_bytealign (w0[0], w0[1], offset); w1[0] = amd_bytealign ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = amd_bytealign (w7[3], 0, offset); c1[0] = amd_bytealign (w7[2], w7[3], offset); c0[3] = amd_bytealign (w7[1], w7[2], offset); c0[2] = amd_bytealign (w7[0], w7[1], offset); c0[1] = amd_bytealign (w6[3], w7[0], offset); c0[0] = amd_bytealign (w6[2], w6[3], offset); w7[3] = amd_bytealign (w6[1], w6[2], offset); w7[2] = amd_bytealign (w6[0], w6[1], offset); w7[1] = amd_bytealign (w5[3], w6[0], offset); w7[0] = amd_bytealign (w5[2], w5[3], offset); w6[3] = amd_bytealign (w5[1], w5[2], offset); w6[2] = amd_bytealign (w5[0], w5[1], offset); w6[1] = amd_bytealign (w4[3], w5[0], offset); w6[0] = amd_bytealign (w4[2], w4[3], offset); w5[3] = amd_bytealign (w4[1], w4[2], offset); w5[2] = amd_bytealign (w4[0], w4[1], offset); w5[1] = amd_bytealign (w3[3], w4[0], offset); w5[0] = amd_bytealign (w3[2], w3[3], offset); w4[3] = amd_bytealign (w3[1], w3[2], offset); w4[2] = amd_bytealign (w3[0], w3[1], offset); w4[1] = amd_bytealign (w2[3], w3[0], offset); w4[0] = amd_bytealign (w2[2], w2[3], offset); w3[3] = amd_bytealign 
(w2[1], w2[2], offset); w3[2] = amd_bytealign (w2[0], w2[1], offset); w3[1] = amd_bytealign (w1[3], w2[0], offset); w3[0] = amd_bytealign (w1[2], w1[3], offset); w2[3] = amd_bytealign (w1[1], w1[2], offset); w2[2] = amd_bytealign (w1[0], w1[1], offset); w2[1] = amd_bytealign (w0[3], w1[0], offset); w2[0] = amd_bytealign (w0[2], w0[3], offset); w1[3] = amd_bytealign (w0[1], w0[2], offset); w1[2] = amd_bytealign (w0[0], w0[1], offset); w1[1] = amd_bytealign ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = amd_bytealign (w7[3], 0, offset); c1[1] = amd_bytealign (w7[2], w7[3], offset); c1[0] = amd_bytealign (w7[1], w7[2], offset); c0[3] = amd_bytealign (w7[0], w7[1], offset); c0[2] = amd_bytealign (w6[3], w7[0], offset); c0[1] = amd_bytealign (w6[2], w6[3], offset); c0[0] = amd_bytealign (w6[1], w6[2], offset); w7[3] = amd_bytealign (w6[0], w6[1], offset); w7[2] = amd_bytealign (w5[3], w6[0], offset); w7[1] = amd_bytealign (w5[2], w5[3], offset); w7[0] = amd_bytealign (w5[1], w5[2], offset); w6[3] = amd_bytealign (w5[0], w5[1], offset); w6[2] = amd_bytealign (w4[3], w5[0], offset); w6[1] = amd_bytealign (w4[2], w4[3], offset); w6[0] = amd_bytealign (w4[1], w4[2], offset); w5[3] = amd_bytealign (w4[0], w4[1], offset); w5[2] = amd_bytealign (w3[3], w4[0], offset); w5[1] = amd_bytealign (w3[2], w3[3], offset); w5[0] = amd_bytealign (w3[1], w3[2], offset); w4[3] = amd_bytealign (w3[0], w3[1], offset); w4[2] = amd_bytealign (w2[3], w3[0], offset); w4[1] = amd_bytealign (w2[2], w2[3], offset); w4[0] = amd_bytealign (w2[1], w2[2], offset); w3[3] = amd_bytealign (w2[0], w2[1], offset); w3[2] = amd_bytealign (w1[3], w2[0], offset); w3[1] = amd_bytealign (w1[2], w1[3], offset); w3[0] = amd_bytealign (w1[1], w1[2], offset); w2[3] = amd_bytealign (w1[0], w1[1], offset); w2[2] = amd_bytealign (w0[3], w1[0], offset); w2[1] = amd_bytealign (w0[2], w0[3], offset); w2[0] = amd_bytealign (w0[1], w0[2], offset); w1[3] = amd_bytealign 
(w0[0], w0[1], offset); w1[2] = amd_bytealign ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = amd_bytealign (w7[3], 0, offset); c1[2] = amd_bytealign (w7[2], w7[3], offset); c1[1] = amd_bytealign (w7[1], w7[2], offset); c1[0] = amd_bytealign (w7[0], w7[1], offset); c0[3] = amd_bytealign (w6[3], w7[0], offset); c0[2] = amd_bytealign (w6[2], w6[3], offset); c0[1] = amd_bytealign (w6[1], w6[2], offset); c0[0] = amd_bytealign (w6[0], w6[1], offset); w7[3] = amd_bytealign (w5[3], w6[0], offset); w7[2] = amd_bytealign (w5[2], w5[3], offset); w7[1] = amd_bytealign (w5[1], w5[2], offset); w7[0] = amd_bytealign (w5[0], w5[1], offset); w6[3] = amd_bytealign (w4[3], w5[0], offset); w6[2] = amd_bytealign (w4[2], w4[3], offset); w6[1] = amd_bytealign (w4[1], w4[2], offset); w6[0] = amd_bytealign (w4[0], w4[1], offset); w5[3] = amd_bytealign (w3[3], w4[0], offset); w5[2] = amd_bytealign (w3[2], w3[3], offset); w5[1] = amd_bytealign (w3[1], w3[2], offset); w5[0] = amd_bytealign (w3[0], w3[1], offset); w4[3] = amd_bytealign (w2[3], w3[0], offset); w4[2] = amd_bytealign (w2[2], w2[3], offset); w4[1] = amd_bytealign (w2[1], w2[2], offset); w4[0] = amd_bytealign (w2[0], w2[1], offset); w3[3] = amd_bytealign (w1[3], w2[0], offset); w3[2] = amd_bytealign (w1[2], w1[3], offset); w3[1] = amd_bytealign (w1[1], w1[2], offset); w3[0] = amd_bytealign (w1[0], w1[1], offset); w2[3] = amd_bytealign (w0[3], w1[0], offset); w2[2] = amd_bytealign (w0[2], w0[3], offset); w2[1] = amd_bytealign (w0[1], w0[2], offset); w2[0] = amd_bytealign (w0[0], w0[1], offset); w1[3] = amd_bytealign ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = amd_bytealign (w7[3], 0, offset); c1[3] = amd_bytealign (w7[2], w7[3], offset); c1[2] = amd_bytealign (w7[1], w7[2], offset); c1[1] = amd_bytealign (w7[0], w7[1], offset); c1[0] = amd_bytealign (w6[3], w7[0], offset); c0[3] = amd_bytealign 
(w6[2], w6[3], offset); c0[2] = amd_bytealign (w6[1], w6[2], offset); c0[1] = amd_bytealign (w6[0], w6[1], offset); c0[0] = amd_bytealign (w5[3], w6[0], offset); w7[3] = amd_bytealign (w5[2], w5[3], offset); w7[2] = amd_bytealign (w5[1], w5[2], offset); w7[1] = amd_bytealign (w5[0], w5[1], offset); w7[0] = amd_bytealign (w4[3], w5[0], offset); w6[3] = amd_bytealign (w4[2], w4[3], offset); w6[2] = amd_bytealign (w4[1], w4[2], offset); w6[1] = amd_bytealign (w4[0], w4[1], offset); w6[0] = amd_bytealign (w3[3], w4[0], offset); w5[3] = amd_bytealign (w3[2], w3[3], offset); w5[2] = amd_bytealign (w3[1], w3[2], offset); w5[1] = amd_bytealign (w3[0], w3[1], offset); w5[0] = amd_bytealign (w2[3], w3[0], offset); w4[3] = amd_bytealign (w2[2], w2[3], offset); w4[2] = amd_bytealign (w2[1], w2[2], offset); w4[1] = amd_bytealign (w2[0], w2[1], offset); w4[0] = amd_bytealign (w1[3], w2[0], offset); w3[3] = amd_bytealign (w1[2], w1[3], offset); w3[2] = amd_bytealign (w1[1], w1[2], offset); w3[1] = amd_bytealign (w1[0], w1[1], offset); w3[0] = amd_bytealign (w0[3], w1[0], offset); w2[3] = amd_bytealign (w0[2], w0[3], offset); w2[2] = amd_bytealign (w0[1], w0[2], offset); w2[1] = amd_bytealign (w0[0], w0[1], offset); w2[0] = amd_bytealign ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = amd_bytealign (w7[3], 0, offset); c2[0] = amd_bytealign (w7[2], w7[3], offset); c1[3] = amd_bytealign (w7[1], w7[2], offset); c1[2] = amd_bytealign (w7[0], w7[1], offset); c1[1] = amd_bytealign (w6[3], w7[0], offset); c1[0] = amd_bytealign (w6[2], w6[3], offset); c0[3] = amd_bytealign (w6[1], w6[2], offset); c0[2] = amd_bytealign (w6[0], w6[1], offset); c0[1] = amd_bytealign (w5[3], w6[0], offset); c0[0] = amd_bytealign (w5[2], w5[3], offset); w7[3] = amd_bytealign (w5[1], w5[2], offset); w7[2] = amd_bytealign (w5[0], w5[1], offset); w7[1] = amd_bytealign (w4[3], w5[0], offset); w7[0] = amd_bytealign (w4[2], w4[3], 
offset); w6[3] = amd_bytealign (w4[1], w4[2], offset); w6[2] = amd_bytealign (w4[0], w4[1], offset); w6[1] = amd_bytealign (w3[3], w4[0], offset); w6[0] = amd_bytealign (w3[2], w3[3], offset); w5[3] = amd_bytealign (w3[1], w3[2], offset); w5[2] = amd_bytealign (w3[0], w3[1], offset); w5[1] = amd_bytealign (w2[3], w3[0], offset); w5[0] = amd_bytealign (w2[2], w2[3], offset); w4[3] = amd_bytealign (w2[1], w2[2], offset); w4[2] = amd_bytealign (w2[0], w2[1], offset); w4[1] = amd_bytealign (w1[3], w2[0], offset); w4[0] = amd_bytealign (w1[2], w1[3], offset); w3[3] = amd_bytealign (w1[1], w1[2], offset); w3[2] = amd_bytealign (w1[0], w1[1], offset); w3[1] = amd_bytealign (w0[3], w1[0], offset); w3[0] = amd_bytealign (w0[2], w0[3], offset); w2[3] = amd_bytealign (w0[1], w0[2], offset); w2[2] = amd_bytealign (w0[0], w0[1], offset); w2[1] = amd_bytealign ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = amd_bytealign (w7[3], 0, offset); c2[1] = amd_bytealign (w7[2], w7[3], offset); c2[0] = amd_bytealign (w7[1], w7[2], offset); c1[3] = amd_bytealign (w7[0], w7[1], offset); c1[2] = amd_bytealign (w6[3], w7[0], offset); c1[1] = amd_bytealign (w6[2], w6[3], offset); c1[0] = amd_bytealign (w6[1], w6[2], offset); c0[3] = amd_bytealign (w6[0], w6[1], offset); c0[2] = amd_bytealign (w5[3], w6[0], offset); c0[1] = amd_bytealign (w5[2], w5[3], offset); c0[0] = amd_bytealign (w5[1], w5[2], offset); w7[3] = amd_bytealign (w5[0], w5[1], offset); w7[2] = amd_bytealign (w4[3], w5[0], offset); w7[1] = amd_bytealign (w4[2], w4[3], offset); w7[0] = amd_bytealign (w4[1], w4[2], offset); w6[3] = amd_bytealign (w4[0], w4[1], offset); w6[2] = amd_bytealign (w3[3], w4[0], offset); w6[1] = amd_bytealign (w3[2], w3[3], offset); w6[0] = amd_bytealign (w3[1], w3[2], offset); w5[3] = amd_bytealign (w3[0], w3[1], offset); w5[2] = amd_bytealign (w2[3], w3[0], offset); w5[1] = amd_bytealign (w2[2], w2[3], 
offset); w5[0] = amd_bytealign (w2[1], w2[2], offset); w4[3] = amd_bytealign (w2[0], w2[1], offset); w4[2] = amd_bytealign (w1[3], w2[0], offset); w4[1] = amd_bytealign (w1[2], w1[3], offset); w4[0] = amd_bytealign (w1[1], w1[2], offset); w3[3] = amd_bytealign (w1[0], w1[1], offset); w3[2] = amd_bytealign (w0[3], w1[0], offset); w3[1] = amd_bytealign (w0[2], w0[3], offset); w3[0] = amd_bytealign (w0[1], w0[2], offset); w2[3] = amd_bytealign (w0[0], w0[1], offset); w2[2] = amd_bytealign ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = amd_bytealign (w7[3], 0, offset); c2[2] = amd_bytealign (w7[2], w7[3], offset); c2[1] = amd_bytealign (w7[1], w7[2], offset); c2[0] = amd_bytealign (w7[0], w7[1], offset); c1[3] = amd_bytealign (w6[3], w7[0], offset); c1[2] = amd_bytealign (w6[2], w6[3], offset); c1[1] = amd_bytealign (w6[1], w6[2], offset); c1[0] = amd_bytealign (w6[0], w6[1], offset); c0[3] = amd_bytealign (w5[3], w6[0], offset); c0[2] = amd_bytealign (w5[2], w5[3], offset); c0[1] = amd_bytealign (w5[1], w5[2], offset); c0[0] = amd_bytealign (w5[0], w5[1], offset); w7[3] = amd_bytealign (w4[3], w5[0], offset); w7[2] = amd_bytealign (w4[2], w4[3], offset); w7[1] = amd_bytealign (w4[1], w4[2], offset); w7[0] = amd_bytealign (w4[0], w4[1], offset); w6[3] = amd_bytealign (w3[3], w4[0], offset); w6[2] = amd_bytealign (w3[2], w3[3], offset); w6[1] = amd_bytealign (w3[1], w3[2], offset); w6[0] = amd_bytealign (w3[0], w3[1], offset); w5[3] = amd_bytealign (w2[3], w3[0], offset); w5[2] = amd_bytealign (w2[2], w2[3], offset); w5[1] = amd_bytealign (w2[1], w2[2], offset); w5[0] = amd_bytealign (w2[0], w2[1], offset); w4[3] = amd_bytealign (w1[3], w2[0], offset); w4[2] = amd_bytealign (w1[2], w1[3], offset); w4[1] = amd_bytealign (w1[1], w1[2], offset); w4[0] = amd_bytealign (w1[0], w1[1], offset); w3[3] = amd_bytealign (w0[3], w1[0], offset); w3[2] = amd_bytealign (w0[2], 
w0[3], offset); w3[1] = amd_bytealign (w0[1], w0[2], offset); w3[0] = amd_bytealign (w0[0], w0[1], offset); w2[3] = amd_bytealign ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = amd_bytealign (w7[3], 0, offset); c2[3] = amd_bytealign (w7[2], w7[3], offset); c2[2] = amd_bytealign (w7[1], w7[2], offset); c2[1] = amd_bytealign (w7[0], w7[1], offset); c2[0] = amd_bytealign (w6[3], w7[0], offset); c1[3] = amd_bytealign (w6[2], w6[3], offset); c1[2] = amd_bytealign (w6[1], w6[2], offset); c1[1] = amd_bytealign (w6[0], w6[1], offset); c1[0] = amd_bytealign (w5[3], w6[0], offset); c0[3] = amd_bytealign (w5[2], w5[3], offset); c0[2] = amd_bytealign (w5[1], w5[2], offset); c0[1] = amd_bytealign (w5[0], w5[1], offset); c0[0] = amd_bytealign (w4[3], w5[0], offset); w7[3] = amd_bytealign (w4[2], w4[3], offset); w7[2] = amd_bytealign (w4[1], w4[2], offset); w7[1] = amd_bytealign (w4[0], w4[1], offset); w7[0] = amd_bytealign (w3[3], w4[0], offset); w6[3] = amd_bytealign (w3[2], w3[3], offset); w6[2] = amd_bytealign (w3[1], w3[2], offset); w6[1] = amd_bytealign (w3[0], w3[1], offset); w6[0] = amd_bytealign (w2[3], w3[0], offset); w5[3] = amd_bytealign (w2[2], w2[3], offset); w5[2] = amd_bytealign (w2[1], w2[2], offset); w5[1] = amd_bytealign (w2[0], w2[1], offset); w5[0] = amd_bytealign (w1[3], w2[0], offset); w4[3] = amd_bytealign (w1[2], w1[3], offset); w4[2] = amd_bytealign (w1[1], w1[2], offset); w4[1] = amd_bytealign (w1[0], w1[1], offset); w4[0] = amd_bytealign (w0[3], w1[0], offset); w3[3] = amd_bytealign (w0[2], w0[3], offset); w3[2] = amd_bytealign (w0[1], w0[2], offset); w3[1] = amd_bytealign (w0[0], w0[1], offset); w3[0] = amd_bytealign ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = amd_bytealign (w7[3], 0, offset); c3[0] = 
amd_bytealign (w7[2], w7[3], offset); c2[3] = amd_bytealign (w7[1], w7[2], offset); c2[2] = amd_bytealign (w7[0], w7[1], offset); c2[1] = amd_bytealign (w6[3], w7[0], offset); c2[0] = amd_bytealign (w6[2], w6[3], offset); c1[3] = amd_bytealign (w6[1], w6[2], offset); c1[2] = amd_bytealign (w6[0], w6[1], offset); c1[1] = amd_bytealign (w5[3], w6[0], offset); c1[0] = amd_bytealign (w5[2], w5[3], offset); c0[3] = amd_bytealign (w5[1], w5[2], offset); c0[2] = amd_bytealign (w5[0], w5[1], offset); c0[1] = amd_bytealign (w4[3], w5[0], offset); c0[0] = amd_bytealign (w4[2], w4[3], offset); w7[3] = amd_bytealign (w4[1], w4[2], offset); w7[2] = amd_bytealign (w4[0], w4[1], offset); w7[1] = amd_bytealign (w3[3], w4[0], offset); w7[0] = amd_bytealign (w3[2], w3[3], offset); w6[3] = amd_bytealign (w3[1], w3[2], offset); w6[2] = amd_bytealign (w3[0], w3[1], offset); w6[1] = amd_bytealign (w2[3], w3[0], offset); w6[0] = amd_bytealign (w2[2], w2[3], offset); w5[3] = amd_bytealign (w2[1], w2[2], offset); w5[2] = amd_bytealign (w2[0], w2[1], offset); w5[1] = amd_bytealign (w1[3], w2[0], offset); w5[0] = amd_bytealign (w1[2], w1[3], offset); w4[3] = amd_bytealign (w1[1], w1[2], offset); w4[2] = amd_bytealign (w1[0], w1[1], offset); w4[1] = amd_bytealign (w0[3], w1[0], offset); w4[0] = amd_bytealign (w0[2], w0[3], offset); w3[3] = amd_bytealign (w0[1], w0[2], offset); w3[2] = amd_bytealign (w0[0], w0[1], offset); w3[1] = amd_bytealign ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = amd_bytealign (w7[3], 0, offset); c3[1] = amd_bytealign (w7[2], w7[3], offset); c3[0] = amd_bytealign (w7[1], w7[2], offset); c2[3] = amd_bytealign (w7[0], w7[1], offset); c2[2] = amd_bytealign (w6[3], w7[0], offset); c2[1] = amd_bytealign (w6[2], w6[3], offset); c2[0] = amd_bytealign (w6[1], w6[2], offset); c1[3] = amd_bytealign (w6[0], w6[1], offset); c1[2] = 
amd_bytealign (w5[3], w6[0], offset); c1[1] = amd_bytealign (w5[2], w5[3], offset); c1[0] = amd_bytealign (w5[1], w5[2], offset); c0[3] = amd_bytealign (w5[0], w5[1], offset); c0[2] = amd_bytealign (w4[3], w5[0], offset); c0[1] = amd_bytealign (w4[2], w4[3], offset); c0[0] = amd_bytealign (w4[1], w4[2], offset); w7[3] = amd_bytealign (w4[0], w4[1], offset); w7[2] = amd_bytealign (w3[3], w4[0], offset); w7[1] = amd_bytealign (w3[2], w3[3], offset); w7[0] = amd_bytealign (w3[1], w3[2], offset); w6[3] = amd_bytealign (w3[0], w3[1], offset); w6[2] = amd_bytealign (w2[3], w3[0], offset); w6[1] = amd_bytealign (w2[2], w2[3], offset); w6[0] = amd_bytealign (w2[1], w2[2], offset); w5[3] = amd_bytealign (w2[0], w2[1], offset); w5[2] = amd_bytealign (w1[3], w2[0], offset); w5[1] = amd_bytealign (w1[2], w1[3], offset); w5[0] = amd_bytealign (w1[1], w1[2], offset); w4[3] = amd_bytealign (w1[0], w1[1], offset); w4[2] = amd_bytealign (w0[3], w1[0], offset); w4[1] = amd_bytealign (w0[2], w0[3], offset); w4[0] = amd_bytealign (w0[1], w0[2], offset); w3[3] = amd_bytealign (w0[0], w0[1], offset); w3[2] = amd_bytealign ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = amd_bytealign (w7[3], 0, offset); c3[2] = amd_bytealign (w7[2], w7[3], offset); c3[1] = amd_bytealign (w7[1], w7[2], offset); c3[0] = amd_bytealign (w7[0], w7[1], offset); c2[3] = amd_bytealign (w6[3], w7[0], offset); c2[2] = amd_bytealign (w6[2], w6[3], offset); c2[1] = amd_bytealign (w6[1], w6[2], offset); c2[0] = amd_bytealign (w6[0], w6[1], offset); c1[3] = amd_bytealign (w5[3], w6[0], offset); c1[2] = amd_bytealign (w5[2], w5[3], offset); c1[1] = amd_bytealign (w5[1], w5[2], offset); c1[0] = amd_bytealign (w5[0], w5[1], offset); c0[3] = amd_bytealign (w4[3], w5[0], offset); c0[2] = amd_bytealign (w4[2], w4[3], offset); c0[1] = amd_bytealign (w4[1], w4[2], offset); 
c0[0] = amd_bytealign (w4[0], w4[1], offset); w7[3] = amd_bytealign (w3[3], w4[0], offset); w7[2] = amd_bytealign (w3[2], w3[3], offset); w7[1] = amd_bytealign (w3[1], w3[2], offset); w7[0] = amd_bytealign (w3[0], w3[1], offset); w6[3] = amd_bytealign (w2[3], w3[0], offset); w6[2] = amd_bytealign (w2[2], w2[3], offset); w6[1] = amd_bytealign (w2[1], w2[2], offset); w6[0] = amd_bytealign (w2[0], w2[1], offset); w5[3] = amd_bytealign (w1[3], w2[0], offset); w5[2] = amd_bytealign (w1[2], w1[3], offset); w5[1] = amd_bytealign (w1[1], w1[2], offset); w5[0] = amd_bytealign (w1[0], w1[1], offset); w4[3] = amd_bytealign (w0[3], w1[0], offset); w4[2] = amd_bytealign (w0[2], w0[3], offset); w4[1] = amd_bytealign (w0[1], w0[2], offset); w4[0] = amd_bytealign (w0[0], w0[1], offset); w3[3] = amd_bytealign ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: c4[0] = amd_bytealign (w7[3], 0, offset); c3[3] = amd_bytealign (w7[2], w7[3], offset); c3[2] = amd_bytealign (w7[1], w7[2], offset); c3[1] = amd_bytealign (w7[0], w7[1], offset); c3[0] = amd_bytealign (w6[3], w7[0], offset); c2[3] = amd_bytealign (w6[2], w6[3], offset); c2[2] = amd_bytealign (w6[1], w6[2], offset); c2[1] = amd_bytealign (w6[0], w6[1], offset); c2[0] = amd_bytealign (w5[3], w6[0], offset); c1[3] = amd_bytealign (w5[2], w5[3], offset); c1[2] = amd_bytealign (w5[1], w5[2], offset); c1[1] = amd_bytealign (w5[0], w5[1], offset); c1[0] = amd_bytealign (w4[3], w5[0], offset); c0[3] = amd_bytealign (w4[2], w4[3], offset); c0[2] = amd_bytealign (w4[1], w4[2], offset); c0[1] = amd_bytealign (w4[0], w4[1], offset); c0[0] = amd_bytealign (w3[3], w4[0], offset); w7[3] = amd_bytealign (w3[2], w3[3], offset); w7[2] = amd_bytealign (w3[1], w3[2], offset); w7[1] = amd_bytealign (w3[0], w3[1], offset); w7[0] = amd_bytealign (w2[3], w3[0], offset); w6[3] = amd_bytealign (w2[2], 
w2[3], offset); w6[2] = amd_bytealign (w2[1], w2[2], offset); w6[1] = amd_bytealign (w2[0], w2[1], offset); w6[0] = amd_bytealign (w1[3], w2[0], offset); w5[3] = amd_bytealign (w1[2], w1[3], offset); w5[2] = amd_bytealign (w1[1], w1[2], offset); w5[1] = amd_bytealign (w1[0], w1[1], offset); w5[0] = amd_bytealign (w0[3], w1[0], offset); w4[3] = amd_bytealign (w0[2], w0[3], offset); w4[2] = amd_bytealign (w0[1], w0[2], offset); w4[1] = amd_bytealign (w0[0], w0[1], offset); w4[0] = amd_bytealign ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: c4[1] = amd_bytealign (w7[3], 0, offset); c4[0] = amd_bytealign (w7[2], w7[3], offset); c3[3] = amd_bytealign (w7[1], w7[2], offset); c3[2] = amd_bytealign (w7[0], w7[1], offset); c3[1] = amd_bytealign (w6[3], w7[0], offset); c3[0] = amd_bytealign (w6[2], w6[3], offset); c2[3] = amd_bytealign (w6[1], w6[2], offset); c2[2] = amd_bytealign (w6[0], w6[1], offset); c2[1] = amd_bytealign (w5[3], w6[0], offset); c2[0] = amd_bytealign (w5[2], w5[3], offset); c1[3] = amd_bytealign (w5[1], w5[2], offset); c1[2] = amd_bytealign (w5[0], w5[1], offset); c1[1] = amd_bytealign (w4[3], w5[0], offset); c1[0] = amd_bytealign (w4[2], w4[3], offset); c0[3] = amd_bytealign (w4[1], w4[2], offset); c0[2] = amd_bytealign (w4[0], w4[1], offset); c0[1] = amd_bytealign (w3[3], w4[0], offset); c0[0] = amd_bytealign (w3[2], w3[3], offset); w7[3] = amd_bytealign (w3[1], w3[2], offset); w7[2] = amd_bytealign (w3[0], w3[1], offset); w7[1] = amd_bytealign (w2[3], w3[0], offset); w7[0] = amd_bytealign (w2[2], w2[3], offset); w6[3] = amd_bytealign (w2[1], w2[2], offset); w6[2] = amd_bytealign (w2[0], w2[1], offset); w6[1] = amd_bytealign (w1[3], w2[0], offset); w6[0] = amd_bytealign (w1[2], w1[3], offset); w5[3] = amd_bytealign (w1[1], w1[2], offset); w5[2] = amd_bytealign (w1[0], w1[1], offset); 
w5[1] = amd_bytealign (w0[3], w1[0], offset); w5[0] = amd_bytealign (w0[2], w0[3], offset); w4[3] = amd_bytealign (w0[1], w0[2], offset); w4[2] = amd_bytealign (w0[0], w0[1], offset); w4[1] = amd_bytealign ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: c4[2] = amd_bytealign (w7[3], 0, offset); c4[1] = amd_bytealign (w7[2], w7[3], offset); c4[0] = amd_bytealign (w7[1], w7[2], offset); c3[3] = amd_bytealign (w7[0], w7[1], offset); c3[2] = amd_bytealign (w6[3], w7[0], offset); c3[1] = amd_bytealign (w6[2], w6[3], offset); c3[0] = amd_bytealign (w6[1], w6[2], offset); c2[3] = amd_bytealign (w6[0], w6[1], offset); c2[2] = amd_bytealign (w5[3], w6[0], offset); c2[1] = amd_bytealign (w5[2], w5[3], offset); c2[0] = amd_bytealign (w5[1], w5[2], offset); c1[3] = amd_bytealign (w5[0], w5[1], offset); c1[2] = amd_bytealign (w4[3], w5[0], offset); c1[1] = amd_bytealign (w4[2], w4[3], offset); c1[0] = amd_bytealign (w4[1], w4[2], offset); c0[3] = amd_bytealign (w4[0], w4[1], offset); c0[2] = amd_bytealign (w3[3], w4[0], offset); c0[1] = amd_bytealign (w3[2], w3[3], offset); c0[0] = amd_bytealign (w3[1], w3[2], offset); w7[3] = amd_bytealign (w3[0], w3[1], offset); w7[2] = amd_bytealign (w2[3], w3[0], offset); w7[1] = amd_bytealign (w2[2], w2[3], offset); w7[0] = amd_bytealign (w2[1], w2[2], offset); w6[3] = amd_bytealign (w2[0], w2[1], offset); w6[2] = amd_bytealign (w1[3], w2[0], offset); w6[1] = amd_bytealign (w1[2], w1[3], offset); w6[0] = amd_bytealign (w1[1], w1[2], offset); w5[3] = amd_bytealign (w1[0], w1[1], offset); w5[2] = amd_bytealign (w0[3], w1[0], offset); w5[1] = amd_bytealign (w0[2], w0[3], offset); w5[0] = amd_bytealign (w0[1], w0[2], offset); w4[3] = amd_bytealign (w0[0], w0[1], offset); w4[2] = amd_bytealign ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; 
w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: c4[3] = amd_bytealign (w7[3], 0, offset); c4[2] = amd_bytealign (w7[2], w7[3], offset); c4[1] = amd_bytealign (w7[1], w7[2], offset); c4[0] = amd_bytealign (w7[0], w7[1], offset); c3[3] = amd_bytealign (w6[3], w7[0], offset); c3[2] = amd_bytealign (w6[2], w6[3], offset); c3[1] = amd_bytealign (w6[1], w6[2], offset); c3[0] = amd_bytealign (w6[0], w6[1], offset); c2[3] = amd_bytealign (w5[3], w6[0], offset); c2[2] = amd_bytealign (w5[2], w5[3], offset); c2[1] = amd_bytealign (w5[1], w5[2], offset); c2[0] = amd_bytealign (w5[0], w5[1], offset); c1[3] = amd_bytealign (w4[3], w5[0], offset); c1[2] = amd_bytealign (w4[2], w4[3], offset); c1[1] = amd_bytealign (w4[1], w4[2], offset); c1[0] = amd_bytealign (w4[0], w4[1], offset); c0[3] = amd_bytealign (w3[3], w4[0], offset); c0[2] = amd_bytealign (w3[2], w3[3], offset); c0[1] = amd_bytealign (w3[1], w3[2], offset); c0[0] = amd_bytealign (w3[0], w3[1], offset); w7[3] = amd_bytealign (w2[3], w3[0], offset); w7[2] = amd_bytealign (w2[2], w2[3], offset); w7[1] = amd_bytealign (w2[1], w2[2], offset); w7[0] = amd_bytealign (w2[0], w2[1], offset); w6[3] = amd_bytealign (w1[3], w2[0], offset); w6[2] = amd_bytealign (w1[2], w1[3], offset); w6[1] = amd_bytealign (w1[1], w1[2], offset); w6[0] = amd_bytealign (w1[0], w1[1], offset); w5[3] = amd_bytealign (w0[3], w1[0], offset); w5[2] = amd_bytealign (w0[2], w0[3], offset); w5[1] = amd_bytealign (w0[1], w0[2], offset); w5[0] = amd_bytealign (w0[0], w0[1], offset); w4[3] = amd_bytealign ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: c5[0] = amd_bytealign (w7[3], 0, offset); c4[3] = amd_bytealign (w7[2], w7[3], offset); c4[2] = 
amd_bytealign (w7[1], w7[2], offset); c4[1] = amd_bytealign (w7[0], w7[1], offset); c4[0] = amd_bytealign (w6[3], w7[0], offset); c3[3] = amd_bytealign (w6[2], w6[3], offset); c3[2] = amd_bytealign (w6[1], w6[2], offset); c3[1] = amd_bytealign (w6[0], w6[1], offset); c3[0] = amd_bytealign (w5[3], w6[0], offset); c2[3] = amd_bytealign (w5[2], w5[3], offset); c2[2] = amd_bytealign (w5[1], w5[2], offset); c2[1] = amd_bytealign (w5[0], w5[1], offset); c2[0] = amd_bytealign (w4[3], w5[0], offset); c1[3] = amd_bytealign (w4[2], w4[3], offset); c1[2] = amd_bytealign (w4[1], w4[2], offset); c1[1] = amd_bytealign (w4[0], w4[1], offset); c1[0] = amd_bytealign (w3[3], w4[0], offset); c0[3] = amd_bytealign (w3[2], w3[3], offset); c0[2] = amd_bytealign (w3[1], w3[2], offset); c0[1] = amd_bytealign (w3[0], w3[1], offset); c0[0] = amd_bytealign (w2[3], w3[0], offset); w7[3] = amd_bytealign (w2[2], w2[3], offset); w7[2] = amd_bytealign (w2[1], w2[2], offset); w7[1] = amd_bytealign (w2[0], w2[1], offset); w7[0] = amd_bytealign (w1[3], w2[0], offset); w6[3] = amd_bytealign (w1[2], w1[3], offset); w6[2] = amd_bytealign (w1[1], w1[2], offset); w6[1] = amd_bytealign (w1[0], w1[1], offset); w6[0] = amd_bytealign (w0[3], w1[0], offset); w5[3] = amd_bytealign (w0[2], w0[3], offset); w5[2] = amd_bytealign (w0[1], w0[2], offset); w5[1] = amd_bytealign (w0[0], w0[1], offset); w5[0] = amd_bytealign ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: c5[1] = amd_bytealign (w7[3], 0, offset); c5[0] = amd_bytealign (w7[2], w7[3], offset); c4[3] = amd_bytealign (w7[1], w7[2], offset); c4[2] = amd_bytealign (w7[0], w7[1], offset); c4[1] = amd_bytealign (w6[3], w7[0], offset); c4[0] = amd_bytealign (w6[2], w6[3], offset); c3[3] = amd_bytealign (w6[1], w6[2], offset); c3[2] = amd_bytealign (w6[0], 
w6[1], offset); c3[1] = amd_bytealign (w5[3], w6[0], offset); c3[0] = amd_bytealign (w5[2], w5[3], offset); c2[3] = amd_bytealign (w5[1], w5[2], offset); c2[2] = amd_bytealign (w5[0], w5[1], offset); c2[1] = amd_bytealign (w4[3], w5[0], offset); c2[0] = amd_bytealign (w4[2], w4[3], offset); c1[3] = amd_bytealign (w4[1], w4[2], offset); c1[2] = amd_bytealign (w4[0], w4[1], offset); c1[1] = amd_bytealign (w3[3], w4[0], offset); c1[0] = amd_bytealign (w3[2], w3[3], offset); c0[3] = amd_bytealign (w3[1], w3[2], offset); c0[2] = amd_bytealign (w3[0], w3[1], offset); c0[1] = amd_bytealign (w2[3], w3[0], offset); c0[0] = amd_bytealign (w2[2], w2[3], offset); w7[3] = amd_bytealign (w2[1], w2[2], offset); w7[2] = amd_bytealign (w2[0], w2[1], offset); w7[1] = amd_bytealign (w1[3], w2[0], offset); w7[0] = amd_bytealign (w1[2], w1[3], offset); w6[3] = amd_bytealign (w1[1], w1[2], offset); w6[2] = amd_bytealign (w1[0], w1[1], offset); w6[1] = amd_bytealign (w0[3], w1[0], offset); w6[0] = amd_bytealign (w0[2], w0[3], offset); w5[3] = amd_bytealign (w0[1], w0[2], offset); w5[2] = amd_bytealign (w0[0], w0[1], offset); w5[1] = amd_bytealign ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: c5[2] = amd_bytealign (w7[3], 0, offset); c5[1] = amd_bytealign (w7[2], w7[3], offset); c5[0] = amd_bytealign (w7[1], w7[2], offset); c4[3] = amd_bytealign (w7[0], w7[1], offset); c4[2] = amd_bytealign (w6[3], w7[0], offset); c4[1] = amd_bytealign (w6[2], w6[3], offset); c4[0] = amd_bytealign (w6[1], w6[2], offset); c3[3] = amd_bytealign (w6[0], w6[1], offset); c3[2] = amd_bytealign (w5[3], w6[0], offset); c3[1] = amd_bytealign (w5[2], w5[3], offset); c3[0] = amd_bytealign (w5[1], w5[2], offset); c2[3] = amd_bytealign (w5[0], w5[1], offset); c2[2] = amd_bytealign (w4[3], w5[0], 
offset); c2[1] = amd_bytealign (w4[2], w4[3], offset); c2[0] = amd_bytealign (w4[1], w4[2], offset); c1[3] = amd_bytealign (w4[0], w4[1], offset); c1[2] = amd_bytealign (w3[3], w4[0], offset); c1[1] = amd_bytealign (w3[2], w3[3], offset); c1[0] = amd_bytealign (w3[1], w3[2], offset); c0[3] = amd_bytealign (w3[0], w3[1], offset); c0[2] = amd_bytealign (w2[3], w3[0], offset); c0[1] = amd_bytealign (w2[2], w2[3], offset); c0[0] = amd_bytealign (w2[1], w2[2], offset); w7[3] = amd_bytealign (w2[0], w2[1], offset); w7[2] = amd_bytealign (w1[3], w2[0], offset); w7[1] = amd_bytealign (w1[2], w1[3], offset); w7[0] = amd_bytealign (w1[1], w1[2], offset); w6[3] = amd_bytealign (w1[0], w1[1], offset); w6[2] = amd_bytealign (w0[3], w1[0], offset); w6[1] = amd_bytealign (w0[2], w0[3], offset); w6[0] = amd_bytealign (w0[1], w0[2], offset); w5[3] = amd_bytealign (w0[0], w0[1], offset); w5[2] = amd_bytealign ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: c5[3] = amd_bytealign (w7[3], 0, offset); c5[2] = amd_bytealign (w7[2], w7[3], offset); c5[1] = amd_bytealign (w7[1], w7[2], offset); c5[0] = amd_bytealign (w7[0], w7[1], offset); c4[3] = amd_bytealign (w6[3], w7[0], offset); c4[2] = amd_bytealign (w6[2], w6[3], offset); c4[1] = amd_bytealign (w6[1], w6[2], offset); c4[0] = amd_bytealign (w6[0], w6[1], offset); c3[3] = amd_bytealign (w5[3], w6[0], offset); c3[2] = amd_bytealign (w5[2], w5[3], offset); c3[1] = amd_bytealign (w5[1], w5[2], offset); c3[0] = amd_bytealign (w5[0], w5[1], offset); c2[3] = amd_bytealign (w4[3], w5[0], offset); c2[2] = amd_bytealign (w4[2], w4[3], offset); c2[1] = amd_bytealign (w4[1], w4[2], offset); c2[0] = amd_bytealign (w4[0], w4[1], offset); c1[3] = amd_bytealign (w3[3], w4[0], offset); c1[2] = amd_bytealign (w3[2], w3[3], 
offset); c1[1] = amd_bytealign (w3[1], w3[2], offset); c1[0] = amd_bytealign (w3[0], w3[1], offset); c0[3] = amd_bytealign (w2[3], w3[0], offset); c0[2] = amd_bytealign (w2[2], w2[3], offset); c0[1] = amd_bytealign (w2[1], w2[2], offset); c0[0] = amd_bytealign (w2[0], w2[1], offset); w7[3] = amd_bytealign (w1[3], w2[0], offset); w7[2] = amd_bytealign (w1[2], w1[3], offset); w7[1] = amd_bytealign (w1[1], w1[2], offset); w7[0] = amd_bytealign (w1[0], w1[1], offset); w6[3] = amd_bytealign (w0[3], w1[0], offset); w6[2] = amd_bytealign (w0[2], w0[3], offset); w6[1] = amd_bytealign (w0[1], w0[2], offset); w6[0] = amd_bytealign (w0[0], w0[1], offset); w5[3] = amd_bytealign ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: c6[0] = amd_bytealign (w7[3], 0, offset); c5[3] = amd_bytealign (w7[2], w7[3], offset); c5[2] = amd_bytealign (w7[1], w7[2], offset); c5[1] = amd_bytealign (w7[0], w7[1], offset); c5[0] = amd_bytealign (w6[3], w7[0], offset); c4[3] = amd_bytealign (w6[2], w6[3], offset); c4[2] = amd_bytealign (w6[1], w6[2], offset); c4[1] = amd_bytealign (w6[0], w6[1], offset); c4[0] = amd_bytealign (w5[3], w6[0], offset); c3[3] = amd_bytealign (w5[2], w5[3], offset); c3[2] = amd_bytealign (w5[1], w5[2], offset); c3[1] = amd_bytealign (w5[0], w5[1], offset); c3[0] = amd_bytealign (w4[3], w5[0], offset); c2[3] = amd_bytealign (w4[2], w4[3], offset); c2[2] = amd_bytealign (w4[1], w4[2], offset); c2[1] = amd_bytealign (w4[0], w4[1], offset); c2[0] = amd_bytealign (w3[3], w4[0], offset); c1[3] = amd_bytealign (w3[2], w3[3], offset); c1[2] = amd_bytealign (w3[1], w3[2], offset); c1[1] = amd_bytealign (w3[0], w3[1], offset); c1[0] = amd_bytealign (w2[3], w3[0], offset); c0[3] = amd_bytealign (w2[2], w2[3], offset); c0[2] = amd_bytealign 
(w2[1], w2[2], offset); c0[1] = amd_bytealign (w2[0], w2[1], offset); c0[0] = amd_bytealign (w1[3], w2[0], offset); w7[3] = amd_bytealign (w1[2], w1[3], offset); w7[2] = amd_bytealign (w1[1], w1[2], offset); w7[1] = amd_bytealign (w1[0], w1[1], offset); w7[0] = amd_bytealign (w0[3], w1[0], offset); w6[3] = amd_bytealign (w0[2], w0[3], offset); w6[2] = amd_bytealign (w0[1], w0[2], offset); w6[1] = amd_bytealign (w0[0], w0[1], offset); w6[0] = amd_bytealign ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: c6[1] = amd_bytealign (w7[3], 0, offset); c6[0] = amd_bytealign (w7[2], w7[3], offset); c5[3] = amd_bytealign (w7[1], w7[2], offset); c5[2] = amd_bytealign (w7[0], w7[1], offset); c5[1] = amd_bytealign (w6[3], w7[0], offset); c5[0] = amd_bytealign (w6[2], w6[3], offset); c4[3] = amd_bytealign (w6[1], w6[2], offset); c4[2] = amd_bytealign (w6[0], w6[1], offset); c4[1] = amd_bytealign (w5[3], w6[0], offset); c4[0] = amd_bytealign (w5[2], w5[3], offset); c3[3] = amd_bytealign (w5[1], w5[2], offset); c3[2] = amd_bytealign (w5[0], w5[1], offset); c3[1] = amd_bytealign (w4[3], w5[0], offset); c3[0] = amd_bytealign (w4[2], w4[3], offset); c2[3] = amd_bytealign (w4[1], w4[2], offset); c2[2] = amd_bytealign (w4[0], w4[1], offset); c2[1] = amd_bytealign (w3[3], w4[0], offset); c2[0] = amd_bytealign (w3[2], w3[3], offset); c1[3] = amd_bytealign (w3[1], w3[2], offset); c1[2] = amd_bytealign (w3[0], w3[1], offset); c1[1] = amd_bytealign (w2[3], w3[0], offset); c1[0] = amd_bytealign (w2[2], w2[3], offset); c0[3] = amd_bytealign (w2[1], w2[2], offset); c0[2] = amd_bytealign (w2[0], w2[1], offset); c0[1] = amd_bytealign (w1[3], w2[0], offset); c0[0] = amd_bytealign (w1[2], w1[3], offset); w7[3] = amd_bytealign (w1[1], w1[2], offset); 
w7[2] = amd_bytealign (w1[0], w1[1], offset); w7[1] = amd_bytealign (w0[3], w1[0], offset); w7[0] = amd_bytealign (w0[2], w0[3], offset); w6[3] = amd_bytealign (w0[1], w0[2], offset); w6[2] = amd_bytealign (w0[0], w0[1], offset); w6[1] = amd_bytealign ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: c6[2] = amd_bytealign (w7[3], 0, offset); c6[1] = amd_bytealign (w7[2], w7[3], offset); c6[0] = amd_bytealign (w7[1], w7[2], offset); c5[3] = amd_bytealign (w7[0], w7[1], offset); c5[2] = amd_bytealign (w6[3], w7[0], offset); c5[1] = amd_bytealign (w6[2], w6[3], offset); c5[0] = amd_bytealign (w6[1], w6[2], offset); c4[3] = amd_bytealign (w6[0], w6[1], offset); c4[2] = amd_bytealign (w5[3], w6[0], offset); c4[1] = amd_bytealign (w5[2], w5[3], offset); c4[0] = amd_bytealign (w5[1], w5[2], offset); c3[3] = amd_bytealign (w5[0], w5[1], offset); c3[2] = amd_bytealign (w4[3], w5[0], offset); c3[1] = amd_bytealign (w4[2], w4[3], offset); c3[0] = amd_bytealign (w4[1], w4[2], offset); c2[3] = amd_bytealign (w4[0], w4[1], offset); c2[2] = amd_bytealign (w3[3], w4[0], offset); c2[1] = amd_bytealign (w3[2], w3[3], offset); c2[0] = amd_bytealign (w3[1], w3[2], offset); c1[3] = amd_bytealign (w3[0], w3[1], offset); c1[2] = amd_bytealign (w2[3], w3[0], offset); c1[1] = amd_bytealign (w2[2], w2[3], offset); c1[0] = amd_bytealign (w2[1], w2[2], offset); c0[3] = amd_bytealign (w2[0], w2[1], offset); c0[2] = amd_bytealign (w1[3], w2[0], offset); c0[1] = amd_bytealign (w1[2], w1[3], offset); c0[0] = amd_bytealign (w1[1], w1[2], offset); w7[3] = amd_bytealign (w1[0], w1[1], offset); w7[2] = amd_bytealign (w0[3], w1[0], offset); w7[1] = amd_bytealign (w0[2], w0[3], offset); w7[0] = amd_bytealign (w0[1], w0[2], offset); w6[3] = 
amd_bytealign (w0[0], w0[1], offset); w6[2] = amd_bytealign ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: c6[3] = amd_bytealign (w7[3], 0, offset); c6[2] = amd_bytealign (w7[2], w7[3], offset); c6[1] = amd_bytealign (w7[1], w7[2], offset); c6[0] = amd_bytealign (w7[0], w7[1], offset); c5[3] = amd_bytealign (w6[3], w7[0], offset); c5[2] = amd_bytealign (w6[2], w6[3], offset); c5[1] = amd_bytealign (w6[1], w6[2], offset); c5[0] = amd_bytealign (w6[0], w6[1], offset); c4[3] = amd_bytealign (w5[3], w6[0], offset); c4[2] = amd_bytealign (w5[2], w5[3], offset); c4[1] = amd_bytealign (w5[1], w5[2], offset); c4[0] = amd_bytealign (w5[0], w5[1], offset); c3[3] = amd_bytealign (w4[3], w5[0], offset); c3[2] = amd_bytealign (w4[2], w4[3], offset); c3[1] = amd_bytealign (w4[1], w4[2], offset); c3[0] = amd_bytealign (w4[0], w4[1], offset); c2[3] = amd_bytealign (w3[3], w4[0], offset); c2[2] = amd_bytealign (w3[2], w3[3], offset); c2[1] = amd_bytealign (w3[1], w3[2], offset); c2[0] = amd_bytealign (w3[0], w3[1], offset); c1[3] = amd_bytealign (w2[3], w3[0], offset); c1[2] = amd_bytealign (w2[2], w2[3], offset); c1[1] = amd_bytealign (w2[1], w2[2], offset); c1[0] = amd_bytealign (w2[0], w2[1], offset); c0[3] = amd_bytealign (w1[3], w2[0], offset); c0[2] = amd_bytealign (w1[2], w1[3], offset); c0[1] = amd_bytealign (w1[1], w1[2], offset); c0[0] = amd_bytealign (w1[0], w1[1], offset); w7[3] = amd_bytealign (w0[3], w1[0], offset); w7[2] = amd_bytealign (w0[2], w0[3], offset); w7[1] = amd_bytealign (w0[1], w0[2], offset); w7[0] = amd_bytealign (w0[0], w0[1], offset); w6[3] = amd_bytealign ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] 
= 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: c7[0] = amd_bytealign (w7[3], 0, offset); c6[3] = amd_bytealign (w7[2], w7[3], offset); c6[2] = amd_bytealign (w7[1], w7[2], offset); c6[1] = amd_bytealign (w7[0], w7[1], offset); c6[0] = amd_bytealign (w6[3], w7[0], offset); c5[3] = amd_bytealign (w6[2], w6[3], offset); c5[2] = amd_bytealign (w6[1], w6[2], offset); c5[1] = amd_bytealign (w6[0], w6[1], offset); c5[0] = amd_bytealign (w5[3], w6[0], offset); c4[3] = amd_bytealign (w5[2], w5[3], offset); c4[2] = amd_bytealign (w5[1], w5[2], offset); c4[1] = amd_bytealign (w5[0], w5[1], offset); c4[0] = amd_bytealign (w4[3], w5[0], offset); c3[3] = amd_bytealign (w4[2], w4[3], offset); c3[2] = amd_bytealign (w4[1], w4[2], offset); c3[1] = amd_bytealign (w4[0], w4[1], offset); c3[0] = amd_bytealign (w3[3], w4[0], offset); c2[3] = amd_bytealign (w3[2], w3[3], offset); c2[2] = amd_bytealign (w3[1], w3[2], offset); c2[1] = amd_bytealign (w3[0], w3[1], offset); c2[0] = amd_bytealign (w2[3], w3[0], offset); c1[3] = amd_bytealign (w2[2], w2[3], offset); c1[2] = amd_bytealign (w2[1], w2[2], offset); c1[1] = amd_bytealign (w2[0], w2[1], offset); c1[0] = amd_bytealign (w1[3], w2[0], offset); c0[3] = amd_bytealign (w1[2], w1[3], offset); c0[2] = amd_bytealign (w1[1], w1[2], offset); c0[1] = amd_bytealign (w1[0], w1[1], offset); c0[0] = amd_bytealign (w0[3], w1[0], offset); w7[3] = amd_bytealign (w0[2], w0[3], offset); w7[2] = amd_bytealign (w0[1], w0[2], offset); w7[1] = amd_bytealign (w0[0], w0[1], offset); w7[0] = amd_bytealign ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] 
= 0; w0[1] = 0; w0[0] = 0; break; case 29: c7[1] = amd_bytealign (w7[3], 0, offset); c7[0] = amd_bytealign (w7[2], w7[3], offset); c6[3] = amd_bytealign (w7[1], w7[2], offset); c6[2] = amd_bytealign (w7[0], w7[1], offset); c6[1] = amd_bytealign (w6[3], w7[0], offset); c6[0] = amd_bytealign (w6[2], w6[3], offset); c5[3] = amd_bytealign (w6[1], w6[2], offset); c5[2] = amd_bytealign (w6[0], w6[1], offset); c5[1] = amd_bytealign (w5[3], w6[0], offset); c5[0] = amd_bytealign (w5[2], w5[3], offset); c4[3] = amd_bytealign (w5[1], w5[2], offset); c4[2] = amd_bytealign (w5[0], w5[1], offset); c4[1] = amd_bytealign (w4[3], w5[0], offset); c4[0] = amd_bytealign (w4[2], w4[3], offset); c3[3] = amd_bytealign (w4[1], w4[2], offset); c3[2] = amd_bytealign (w4[0], w4[1], offset); c3[1] = amd_bytealign (w3[3], w4[0], offset); c3[0] = amd_bytealign (w3[2], w3[3], offset); c2[3] = amd_bytealign (w3[1], w3[2], offset); c2[2] = amd_bytealign (w3[0], w3[1], offset); c2[1] = amd_bytealign (w2[3], w3[0], offset); c2[0] = amd_bytealign (w2[2], w2[3], offset); c1[3] = amd_bytealign (w2[1], w2[2], offset); c1[2] = amd_bytealign (w2[0], w2[1], offset); c1[1] = amd_bytealign (w1[3], w2[0], offset); c1[0] = amd_bytealign (w1[2], w1[3], offset); c0[3] = amd_bytealign (w1[1], w1[2], offset); c0[2] = amd_bytealign (w1[0], w1[1], offset); c0[1] = amd_bytealign (w0[3], w1[0], offset); c0[0] = amd_bytealign (w0[2], w0[3], offset); w7[3] = amd_bytealign (w0[1], w0[2], offset); w7[2] = amd_bytealign (w0[0], w0[1], offset); w7[1] = amd_bytealign ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: c7[2] = amd_bytealign (w7[3], 0, offset); c7[1] = amd_bytealign (w7[2], w7[3], offset); c7[0] = amd_bytealign 
(w7[1], w7[2], offset); c6[3] = amd_bytealign (w7[0], w7[1], offset); c6[2] = amd_bytealign (w6[3], w7[0], offset); c6[1] = amd_bytealign (w6[2], w6[3], offset); c6[0] = amd_bytealign (w6[1], w6[2], offset); c5[3] = amd_bytealign (w6[0], w6[1], offset); c5[2] = amd_bytealign (w5[3], w6[0], offset); c5[1] = amd_bytealign (w5[2], w5[3], offset); c5[0] = amd_bytealign (w5[1], w5[2], offset); c4[3] = amd_bytealign (w5[0], w5[1], offset); c4[2] = amd_bytealign (w4[3], w5[0], offset); c4[1] = amd_bytealign (w4[2], w4[3], offset); c4[0] = amd_bytealign (w4[1], w4[2], offset); c3[3] = amd_bytealign (w4[0], w4[1], offset); c3[2] = amd_bytealign (w3[3], w4[0], offset); c3[1] = amd_bytealign (w3[2], w3[3], offset); c3[0] = amd_bytealign (w3[1], w3[2], offset); c2[3] = amd_bytealign (w3[0], w3[1], offset); c2[2] = amd_bytealign (w2[3], w3[0], offset); c2[1] = amd_bytealign (w2[2], w2[3], offset); c2[0] = amd_bytealign (w2[1], w2[2], offset); c1[3] = amd_bytealign (w2[0], w2[1], offset); c1[2] = amd_bytealign (w1[3], w2[0], offset); c1[1] = amd_bytealign (w1[2], w1[3], offset); c1[0] = amd_bytealign (w1[1], w1[2], offset); c0[3] = amd_bytealign (w1[0], w1[1], offset); c0[2] = amd_bytealign (w0[3], w1[0], offset); c0[1] = amd_bytealign (w0[2], w0[3], offset); c0[0] = amd_bytealign (w0[1], w0[2], offset); w7[3] = amd_bytealign (w0[0], w0[1], offset); w7[2] = amd_bytealign ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: c7[3] = amd_bytealign (w7[3], 0, offset); c7[2] = amd_bytealign (w7[2], w7[3], offset); c7[1] = amd_bytealign (w7[1], w7[2], offset); c7[0] = amd_bytealign (w7[0], w7[1], offset); c6[3] = amd_bytealign (w6[3], w7[0], offset); c6[2] = amd_bytealign 
(w6[2], w6[3], offset); c6[1] = amd_bytealign (w6[1], w6[2], offset); c6[0] = amd_bytealign (w6[0], w6[1], offset); c5[3] = amd_bytealign (w5[3], w6[0], offset); c5[2] = amd_bytealign (w5[2], w5[3], offset); c5[1] = amd_bytealign (w5[1], w5[2], offset); c5[0] = amd_bytealign (w5[0], w5[1], offset); c4[3] = amd_bytealign (w4[3], w5[0], offset); c4[2] = amd_bytealign (w4[2], w4[3], offset); c4[1] = amd_bytealign (w4[1], w4[2], offset); c4[0] = amd_bytealign (w4[0], w4[1], offset); c3[3] = amd_bytealign (w3[3], w4[0], offset); c3[2] = amd_bytealign (w3[2], w3[3], offset); c3[1] = amd_bytealign (w3[1], w3[2], offset); c3[0] = amd_bytealign (w3[0], w3[1], offset); c2[3] = amd_bytealign (w2[3], w3[0], offset); c2[2] = amd_bytealign (w2[2], w2[3], offset); c2[1] = amd_bytealign (w2[1], w2[2], offset); c2[0] = amd_bytealign (w2[0], w2[1], offset); c1[3] = amd_bytealign (w1[3], w2[0], offset); c1[2] = amd_bytealign (w1[2], w1[3], offset); c1[1] = amd_bytealign (w1[1], w1[2], offset); c1[0] = amd_bytealign (w1[0], w1[1], offset); c0[3] = amd_bytealign (w0[3], w1[0], offset); c0[2] = amd_bytealign (w0[2], w0[3], offset); c0[1] = amd_bytealign (w0[1], w0[2], offset); c0[0] = amd_bytealign (w0[0], w0[1], offset); w7[3] = amd_bytealign ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: c0[0] = __byte_perm ( 0, w7[3], selector); w7[3] = __byte_perm (w7[3], w7[2], selector); w7[2] = __byte_perm 
(w7[2], w7[1], selector); w7[1] = __byte_perm (w7[1], w7[0], selector); w7[0] = __byte_perm (w7[0], w6[3], selector); w6[3] = __byte_perm (w6[3], w6[2], selector); w6[2] = __byte_perm (w6[2], w6[1], selector); w6[1] = __byte_perm (w6[1], w6[0], selector); w6[0] = __byte_perm (w6[0], w5[3], selector); w5[3] = __byte_perm (w5[3], w5[2], selector); w5[2] = __byte_perm (w5[2], w5[1], selector); w5[1] = __byte_perm (w5[1], w5[0], selector); w5[0] = __byte_perm (w5[0], w4[3], selector); w4[3] = __byte_perm (w4[3], w4[2], selector); w4[2] = __byte_perm (w4[2], w4[1], selector); w4[1] = __byte_perm (w4[1], w4[0], selector); w4[0] = __byte_perm (w4[0], w3[3], selector); w3[3] = __byte_perm (w3[3], w3[2], selector); w3[2] = __byte_perm (w3[2], w3[1], selector); w3[1] = __byte_perm (w3[1], w3[0], selector); w3[0] = __byte_perm (w3[0], w2[3], selector); w2[3] = __byte_perm (w2[3], w2[2], selector); w2[2] = __byte_perm (w2[2], w2[1], selector); w2[1] = __byte_perm (w2[1], w2[0], selector); w2[0] = __byte_perm (w2[0], w1[3], selector); w1[3] = __byte_perm (w1[3], w1[2], selector); w1[2] = __byte_perm (w1[2], w1[1], selector); w1[1] = __byte_perm (w1[1], w1[0], selector); w1[0] = __byte_perm (w1[0], w0[3], selector); w0[3] = __byte_perm (w0[3], w0[2], selector); w0[2] = __byte_perm (w0[2], w0[1], selector); w0[1] = __byte_perm (w0[1], w0[0], selector); w0[0] = __byte_perm (w0[0], 0, selector); break; case 1: c0[1] = __byte_perm ( 0, w7[3], selector); c0[0] = __byte_perm (w7[3], w7[2], selector); w7[3] = __byte_perm (w7[2], w7[1], selector); w7[2] = __byte_perm (w7[1], w7[0], selector); w7[1] = __byte_perm (w7[0], w6[3], selector); w7[0] = __byte_perm (w6[3], w6[2], selector); w6[3] = __byte_perm (w6[2], w6[1], selector); w6[2] = __byte_perm (w6[1], w6[0], selector); w6[1] = __byte_perm (w6[0], w5[3], selector); w6[0] = __byte_perm (w5[3], w5[2], selector); w5[3] = __byte_perm (w5[2], w5[1], selector); w5[2] = __byte_perm (w5[1], w5[0], selector); w5[1] = __byte_perm (w5[0], 
w4[3], selector); w5[0] = __byte_perm (w4[3], w4[2], selector); w4[3] = __byte_perm (w4[2], w4[1], selector); w4[2] = __byte_perm (w4[1], w4[0], selector); w4[1] = __byte_perm (w4[0], w3[3], selector); w4[0] = __byte_perm (w3[3], w3[2], selector); w3[3] = __byte_perm (w3[2], w3[1], selector); w3[2] = __byte_perm (w3[1], w3[0], selector); w3[1] = __byte_perm (w3[0], w2[3], selector); w3[0] = __byte_perm (w2[3], w2[2], selector); w2[3] = __byte_perm (w2[2], w2[1], selector); w2[2] = __byte_perm (w2[1], w2[0], selector); w2[1] = __byte_perm (w2[0], w1[3], selector); w2[0] = __byte_perm (w1[3], w1[2], selector); w1[3] = __byte_perm (w1[2], w1[1], selector); w1[2] = __byte_perm (w1[1], w1[0], selector); w1[1] = __byte_perm (w1[0], w0[3], selector); w1[0] = __byte_perm (w0[3], w0[2], selector); w0[3] = __byte_perm (w0[2], w0[1], selector); w0[2] = __byte_perm (w0[1], w0[0], selector); w0[1] = __byte_perm (w0[0], 0, selector); w0[0] = 0; break; case 2: c0[2] = __byte_perm ( 0, w7[3], selector); c0[1] = __byte_perm (w7[3], w7[2], selector); c0[0] = __byte_perm (w7[2], w7[1], selector); w7[3] = __byte_perm (w7[1], w7[0], selector); w7[2] = __byte_perm (w7[0], w6[3], selector); w7[1] = __byte_perm (w6[3], w6[2], selector); w7[0] = __byte_perm (w6[2], w6[1], selector); w6[3] = __byte_perm (w6[1], w6[0], selector); w6[2] = __byte_perm (w6[0], w5[3], selector); w6[1] = __byte_perm (w5[3], w5[2], selector); w6[0] = __byte_perm (w5[2], w5[1], selector); w5[3] = __byte_perm (w5[1], w5[0], selector); w5[2] = __byte_perm (w5[0], w4[3], selector); w5[1] = __byte_perm (w4[3], w4[2], selector); w5[0] = __byte_perm (w4[2], w4[1], selector); w4[3] = __byte_perm (w4[1], w4[0], selector); w4[2] = __byte_perm (w4[0], w3[3], selector); w4[1] = __byte_perm (w3[3], w3[2], selector); w4[0] = __byte_perm (w3[2], w3[1], selector); w3[3] = __byte_perm (w3[1], w3[0], selector); w3[2] = __byte_perm (w3[0], w2[3], selector); w3[1] = __byte_perm (w2[3], w2[2], selector); w3[0] = __byte_perm (w2[2], 
w2[1], selector); w2[3] = __byte_perm (w2[1], w2[0], selector); w2[2] = __byte_perm (w2[0], w1[3], selector); w2[1] = __byte_perm (w1[3], w1[2], selector); w2[0] = __byte_perm (w1[2], w1[1], selector); w1[3] = __byte_perm (w1[1], w1[0], selector); w1[2] = __byte_perm (w1[0], w0[3], selector); w1[1] = __byte_perm (w0[3], w0[2], selector); w1[0] = __byte_perm (w0[2], w0[1], selector); w0[3] = __byte_perm (w0[1], w0[0], selector); w0[2] = __byte_perm (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = __byte_perm ( 0, w7[3], selector); c0[2] = __byte_perm (w7[3], w7[2], selector); c0[1] = __byte_perm (w7[2], w7[1], selector); c0[0] = __byte_perm (w7[1], w7[0], selector); w7[3] = __byte_perm (w7[0], w6[3], selector); w7[2] = __byte_perm (w6[3], w6[2], selector); w7[1] = __byte_perm (w6[2], w6[1], selector); w7[0] = __byte_perm (w6[1], w6[0], selector); w6[3] = __byte_perm (w6[0], w5[3], selector); w6[2] = __byte_perm (w5[3], w5[2], selector); w6[1] = __byte_perm (w5[2], w5[1], selector); w6[0] = __byte_perm (w5[1], w5[0], selector); w5[3] = __byte_perm (w5[0], w4[3], selector); w5[2] = __byte_perm (w4[3], w4[2], selector); w5[1] = __byte_perm (w4[2], w4[1], selector); w5[0] = __byte_perm (w4[1], w4[0], selector); w4[3] = __byte_perm (w4[0], w3[3], selector); w4[2] = __byte_perm (w3[3], w3[2], selector); w4[1] = __byte_perm (w3[2], w3[1], selector); w4[0] = __byte_perm (w3[1], w3[0], selector); w3[3] = __byte_perm (w3[0], w2[3], selector); w3[2] = __byte_perm (w2[3], w2[2], selector); w3[1] = __byte_perm (w2[2], w2[1], selector); w3[0] = __byte_perm (w2[1], w2[0], selector); w2[3] = __byte_perm (w2[0], w1[3], selector); w2[2] = __byte_perm (w1[3], w1[2], selector); w2[1] = __byte_perm (w1[2], w1[1], selector); w2[0] = __byte_perm (w1[1], w1[0], selector); w1[3] = __byte_perm (w1[0], w0[3], selector); w1[2] = __byte_perm (w0[3], w0[2], selector); w1[1] = __byte_perm (w0[2], w0[1], selector); w1[0] = __byte_perm (w0[1], w0[0], selector); w0[3] = __byte_perm 
(w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = __byte_perm ( 0, w7[3], selector); c0[3] = __byte_perm (w7[3], w7[2], selector); c0[2] = __byte_perm (w7[2], w7[1], selector); c0[1] = __byte_perm (w7[1], w7[0], selector); c0[0] = __byte_perm (w7[0], w6[3], selector); w7[3] = __byte_perm (w6[3], w6[2], selector); w7[2] = __byte_perm (w6[2], w6[1], selector); w7[1] = __byte_perm (w6[1], w6[0], selector); w7[0] = __byte_perm (w6[0], w5[3], selector); w6[3] = __byte_perm (w5[3], w5[2], selector); w6[2] = __byte_perm (w5[2], w5[1], selector); w6[1] = __byte_perm (w5[1], w5[0], selector); w6[0] = __byte_perm (w5[0], w4[3], selector); w5[3] = __byte_perm (w4[3], w4[2], selector); w5[2] = __byte_perm (w4[2], w4[1], selector); w5[1] = __byte_perm (w4[1], w4[0], selector); w5[0] = __byte_perm (w4[0], w3[3], selector); w4[3] = __byte_perm (w3[3], w3[2], selector); w4[2] = __byte_perm (w3[2], w3[1], selector); w4[1] = __byte_perm (w3[1], w3[0], selector); w4[0] = __byte_perm (w3[0], w2[3], selector); w3[3] = __byte_perm (w2[3], w2[2], selector); w3[2] = __byte_perm (w2[2], w2[1], selector); w3[1] = __byte_perm (w2[1], w2[0], selector); w3[0] = __byte_perm (w2[0], w1[3], selector); w2[3] = __byte_perm (w1[3], w1[2], selector); w2[2] = __byte_perm (w1[2], w1[1], selector); w2[1] = __byte_perm (w1[1], w1[0], selector); w2[0] = __byte_perm (w1[0], w0[3], selector); w1[3] = __byte_perm (w0[3], w0[2], selector); w1[2] = __byte_perm (w0[2], w0[1], selector); w1[1] = __byte_perm (w0[1], w0[0], selector); w1[0] = __byte_perm (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = __byte_perm ( 0, w7[3], selector); c1[0] = __byte_perm (w7[3], w7[2], selector); c0[3] = __byte_perm (w7[2], w7[1], selector); c0[2] = __byte_perm (w7[1], w7[0], selector); c0[1] = __byte_perm (w7[0], w6[3], selector); c0[0] = __byte_perm (w6[3], w6[2], selector); w7[3] = __byte_perm (w6[2], w6[1], selector); w7[2] = __byte_perm (w6[1], w6[0], 
selector); w7[1] = __byte_perm (w6[0], w5[3], selector); w7[0] = __byte_perm (w5[3], w5[2], selector); w6[3] = __byte_perm (w5[2], w5[1], selector); w6[2] = __byte_perm (w5[1], w5[0], selector); w6[1] = __byte_perm (w5[0], w4[3], selector); w6[0] = __byte_perm (w4[3], w4[2], selector); w5[3] = __byte_perm (w4[2], w4[1], selector); w5[2] = __byte_perm (w4[1], w4[0], selector); w5[1] = __byte_perm (w4[0], w3[3], selector); w5[0] = __byte_perm (w3[3], w3[2], selector); w4[3] = __byte_perm (w3[2], w3[1], selector); w4[2] = __byte_perm (w3[1], w3[0], selector); w4[1] = __byte_perm (w3[0], w2[3], selector); w4[0] = __byte_perm (w2[3], w2[2], selector); w3[3] = __byte_perm (w2[2], w2[1], selector); w3[2] = __byte_perm (w2[1], w2[0], selector); w3[1] = __byte_perm (w2[0], w1[3], selector); w3[0] = __byte_perm (w1[3], w1[2], selector); w2[3] = __byte_perm (w1[2], w1[1], selector); w2[2] = __byte_perm (w1[1], w1[0], selector); w2[1] = __byte_perm (w1[0], w0[3], selector); w2[0] = __byte_perm (w0[3], w0[2], selector); w1[3] = __byte_perm (w0[2], w0[1], selector); w1[2] = __byte_perm (w0[1], w0[0], selector); w1[1] = __byte_perm (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = __byte_perm ( 0, w7[3], selector); c1[1] = __byte_perm (w7[3], w7[2], selector); c1[0] = __byte_perm (w7[2], w7[1], selector); c0[3] = __byte_perm (w7[1], w7[0], selector); c0[2] = __byte_perm (w7[0], w6[3], selector); c0[1] = __byte_perm (w6[3], w6[2], selector); c0[0] = __byte_perm (w6[2], w6[1], selector); w7[3] = __byte_perm (w6[1], w6[0], selector); w7[2] = __byte_perm (w6[0], w5[3], selector); w7[1] = __byte_perm (w5[3], w5[2], selector); w7[0] = __byte_perm (w5[2], w5[1], selector); w6[3] = __byte_perm (w5[1], w5[0], selector); w6[2] = __byte_perm (w5[0], w4[3], selector); w6[1] = __byte_perm (w4[3], w4[2], selector); w6[0] = __byte_perm (w4[2], w4[1], selector); w5[3] = __byte_perm (w4[1], w4[0], selector); w5[2] = __byte_perm (w4[0], w3[3], 
selector); w5[1] = __byte_perm (w3[3], w3[2], selector); w5[0] = __byte_perm (w3[2], w3[1], selector); w4[3] = __byte_perm (w3[1], w3[0], selector); w4[2] = __byte_perm (w3[0], w2[3], selector); w4[1] = __byte_perm (w2[3], w2[2], selector); w4[0] = __byte_perm (w2[2], w2[1], selector); w3[3] = __byte_perm (w2[1], w2[0], selector); w3[2] = __byte_perm (w2[0], w1[3], selector); w3[1] = __byte_perm (w1[3], w1[2], selector); w3[0] = __byte_perm (w1[2], w1[1], selector); w2[3] = __byte_perm (w1[1], w1[0], selector); w2[2] = __byte_perm (w1[0], w0[3], selector); w2[1] = __byte_perm (w0[3], w0[2], selector); w2[0] = __byte_perm (w0[2], w0[1], selector); w1[3] = __byte_perm (w0[1], w0[0], selector); w1[2] = __byte_perm (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = __byte_perm ( 0, w7[3], selector); c1[2] = __byte_perm (w7[3], w7[2], selector); c1[1] = __byte_perm (w7[2], w7[1], selector); c1[0] = __byte_perm (w7[1], w7[0], selector); c0[3] = __byte_perm (w7[0], w6[3], selector); c0[2] = __byte_perm (w6[3], w6[2], selector); c0[1] = __byte_perm (w6[2], w6[1], selector); c0[0] = __byte_perm (w6[1], w6[0], selector); w7[3] = __byte_perm (w6[0], w5[3], selector); w7[2] = __byte_perm (w5[3], w5[2], selector); w7[1] = __byte_perm (w5[2], w5[1], selector); w7[0] = __byte_perm (w5[1], w5[0], selector); w6[3] = __byte_perm (w5[0], w4[3], selector); w6[2] = __byte_perm (w4[3], w4[2], selector); w6[1] = __byte_perm (w4[2], w4[1], selector); w6[0] = __byte_perm (w4[1], w4[0], selector); w5[3] = __byte_perm (w4[0], w3[3], selector); w5[2] = __byte_perm (w3[3], w3[2], selector); w5[1] = __byte_perm (w3[2], w3[1], selector); w5[0] = __byte_perm (w3[1], w3[0], selector); w4[3] = __byte_perm (w3[0], w2[3], selector); w4[2] = __byte_perm (w2[3], w2[2], selector); w4[1] = __byte_perm (w2[2], w2[1], selector); w4[0] = __byte_perm (w2[1], w2[0], selector); w3[3] = __byte_perm (w2[0], w1[3], selector); w3[2] = __byte_perm (w1[3], 
w1[2], selector); w3[1] = __byte_perm (w1[2], w1[1], selector); w3[0] = __byte_perm (w1[1], w1[0], selector); w2[3] = __byte_perm (w1[0], w0[3], selector); w2[2] = __byte_perm (w0[3], w0[2], selector); w2[1] = __byte_perm (w0[2], w0[1], selector); w2[0] = __byte_perm (w0[1], w0[0], selector); w1[3] = __byte_perm (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = __byte_perm ( 0, w7[3], selector); c1[3] = __byte_perm (w7[3], w7[2], selector); c1[2] = __byte_perm (w7[2], w7[1], selector); c1[1] = __byte_perm (w7[1], w7[0], selector); c1[0] = __byte_perm (w7[0], w6[3], selector); c0[3] = __byte_perm (w6[3], w6[2], selector); c0[2] = __byte_perm (w6[2], w6[1], selector); c0[1] = __byte_perm (w6[1], w6[0], selector); c0[0] = __byte_perm (w6[0], w5[3], selector); w7[3] = __byte_perm (w5[3], w5[2], selector); w7[2] = __byte_perm (w5[2], w5[1], selector); w7[1] = __byte_perm (w5[1], w5[0], selector); w7[0] = __byte_perm (w5[0], w4[3], selector); w6[3] = __byte_perm (w4[3], w4[2], selector); w6[2] = __byte_perm (w4[2], w4[1], selector); w6[1] = __byte_perm (w4[1], w4[0], selector); w6[0] = __byte_perm (w4[0], w3[3], selector); w5[3] = __byte_perm (w3[3], w3[2], selector); w5[2] = __byte_perm (w3[2], w3[1], selector); w5[1] = __byte_perm (w3[1], w3[0], selector); w5[0] = __byte_perm (w3[0], w2[3], selector); w4[3] = __byte_perm (w2[3], w2[2], selector); w4[2] = __byte_perm (w2[2], w2[1], selector); w4[1] = __byte_perm (w2[1], w2[0], selector); w4[0] = __byte_perm (w2[0], w1[3], selector); w3[3] = __byte_perm (w1[3], w1[2], selector); w3[2] = __byte_perm (w1[2], w1[1], selector); w3[1] = __byte_perm (w1[1], w1[0], selector); w3[0] = __byte_perm (w1[0], w0[3], selector); w2[3] = __byte_perm (w0[3], w0[2], selector); w2[2] = __byte_perm (w0[2], w0[1], selector); w2[1] = __byte_perm (w0[1], w0[0], selector); w2[0] = __byte_perm (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] 
= 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = __byte_perm ( 0, w7[3], selector); c2[0] = __byte_perm (w7[3], w7[2], selector); c1[3] = __byte_perm (w7[2], w7[1], selector); c1[2] = __byte_perm (w7[1], w7[0], selector); c1[1] = __byte_perm (w7[0], w6[3], selector); c1[0] = __byte_perm (w6[3], w6[2], selector); c0[3] = __byte_perm (w6[2], w6[1], selector); c0[2] = __byte_perm (w6[1], w6[0], selector); c0[1] = __byte_perm (w6[0], w5[3], selector); c0[0] = __byte_perm (w5[3], w5[2], selector); w7[3] = __byte_perm (w5[2], w5[1], selector); w7[2] = __byte_perm (w5[1], w5[0], selector); w7[1] = __byte_perm (w5[0], w4[3], selector); w7[0] = __byte_perm (w4[3], w4[2], selector); w6[3] = __byte_perm (w4[2], w4[1], selector); w6[2] = __byte_perm (w4[1], w4[0], selector); w6[1] = __byte_perm (w4[0], w3[3], selector); w6[0] = __byte_perm (w3[3], w3[2], selector); w5[3] = __byte_perm (w3[2], w3[1], selector); w5[2] = __byte_perm (w3[1], w3[0], selector); w5[1] = __byte_perm (w3[0], w2[3], selector); w5[0] = __byte_perm (w2[3], w2[2], selector); w4[3] = __byte_perm (w2[2], w2[1], selector); w4[2] = __byte_perm (w2[1], w2[0], selector); w4[1] = __byte_perm (w2[0], w1[3], selector); w4[0] = __byte_perm (w1[3], w1[2], selector); w3[3] = __byte_perm (w1[2], w1[1], selector); w3[2] = __byte_perm (w1[1], w1[0], selector); w3[1] = __byte_perm (w1[0], w0[3], selector); w3[0] = __byte_perm (w0[3], w0[2], selector); w2[3] = __byte_perm (w0[2], w0[1], selector); w2[2] = __byte_perm (w0[1], w0[0], selector); w2[1] = __byte_perm (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = __byte_perm ( 0, w7[3], selector); c2[1] = __byte_perm (w7[3], w7[2], selector); c2[0] = __byte_perm (w7[2], w7[1], selector); c1[3] = __byte_perm (w7[1], w7[0], selector); c1[2] = __byte_perm (w7[0], w6[3], selector); c1[1] = __byte_perm (w6[3], w6[2], selector); c1[0] = __byte_perm (w6[2], w6[1], selector); c0[3] = 
__byte_perm (w6[1], w6[0], selector); c0[2] = __byte_perm (w6[0], w5[3], selector); c0[1] = __byte_perm (w5[3], w5[2], selector); c0[0] = __byte_perm (w5[2], w5[1], selector); w7[3] = __byte_perm (w5[1], w5[0], selector); w7[2] = __byte_perm (w5[0], w4[3], selector); w7[1] = __byte_perm (w4[3], w4[2], selector); w7[0] = __byte_perm (w4[2], w4[1], selector); w6[3] = __byte_perm (w4[1], w4[0], selector); w6[2] = __byte_perm (w4[0], w3[3], selector); w6[1] = __byte_perm (w3[3], w3[2], selector); w6[0] = __byte_perm (w3[2], w3[1], selector); w5[3] = __byte_perm (w3[1], w3[0], selector); w5[2] = __byte_perm (w3[0], w2[3], selector); w5[1] = __byte_perm (w2[3], w2[2], selector); w5[0] = __byte_perm (w2[2], w2[1], selector); w4[3] = __byte_perm (w2[1], w2[0], selector); w4[2] = __byte_perm (w2[0], w1[3], selector); w4[1] = __byte_perm (w1[3], w1[2], selector); w4[0] = __byte_perm (w1[2], w1[1], selector); w3[3] = __byte_perm (w1[1], w1[0], selector); w3[2] = __byte_perm (w1[0], w0[3], selector); w3[1] = __byte_perm (w0[3], w0[2], selector); w3[0] = __byte_perm (w0[2], w0[1], selector); w2[3] = __byte_perm (w0[1], w0[0], selector); w2[2] = __byte_perm (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = __byte_perm ( 0, w7[3], selector); c2[2] = __byte_perm (w7[3], w7[2], selector); c2[1] = __byte_perm (w7[2], w7[1], selector); c2[0] = __byte_perm (w7[1], w7[0], selector); c1[3] = __byte_perm (w7[0], w6[3], selector); c1[2] = __byte_perm (w6[3], w6[2], selector); c1[1] = __byte_perm (w6[2], w6[1], selector); c1[0] = __byte_perm (w6[1], w6[0], selector); c0[3] = __byte_perm (w6[0], w5[3], selector); c0[2] = __byte_perm (w5[3], w5[2], selector); c0[1] = __byte_perm (w5[2], w5[1], selector); c0[0] = __byte_perm (w5[1], w5[0], selector); w7[3] = __byte_perm (w5[0], w4[3], selector); w7[2] = __byte_perm (w4[3], w4[2], selector); w7[1] = __byte_perm (w4[2], w4[1], selector); 
w7[0] = __byte_perm (w4[1], w4[0], selector); w6[3] = __byte_perm (w4[0], w3[3], selector); w6[2] = __byte_perm (w3[3], w3[2], selector); w6[1] = __byte_perm (w3[2], w3[1], selector); w6[0] = __byte_perm (w3[1], w3[0], selector); w5[3] = __byte_perm (w3[0], w2[3], selector); w5[2] = __byte_perm (w2[3], w2[2], selector); w5[1] = __byte_perm (w2[2], w2[1], selector); w5[0] = __byte_perm (w2[1], w2[0], selector); w4[3] = __byte_perm (w2[0], w1[3], selector); w4[2] = __byte_perm (w1[3], w1[2], selector); w4[1] = __byte_perm (w1[2], w1[1], selector); w4[0] = __byte_perm (w1[1], w1[0], selector); w3[3] = __byte_perm (w1[0], w0[3], selector); w3[2] = __byte_perm (w0[3], w0[2], selector); w3[1] = __byte_perm (w0[2], w0[1], selector); w3[0] = __byte_perm (w0[1], w0[0], selector); w2[3] = __byte_perm (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = __byte_perm ( 0, w7[3], selector); c2[3] = __byte_perm (w7[3], w7[2], selector); c2[2] = __byte_perm (w7[2], w7[1], selector); c2[1] = __byte_perm (w7[1], w7[0], selector); c2[0] = __byte_perm (w7[0], w6[3], selector); c1[3] = __byte_perm (w6[3], w6[2], selector); c1[2] = __byte_perm (w6[2], w6[1], selector); c1[1] = __byte_perm (w6[1], w6[0], selector); c1[0] = __byte_perm (w6[0], w5[3], selector); c0[3] = __byte_perm (w5[3], w5[2], selector); c0[2] = __byte_perm (w5[2], w5[1], selector); c0[1] = __byte_perm (w5[1], w5[0], selector); c0[0] = __byte_perm (w5[0], w4[3], selector); w7[3] = __byte_perm (w4[3], w4[2], selector); w7[2] = __byte_perm (w4[2], w4[1], selector); w7[1] = __byte_perm (w4[1], w4[0], selector); w7[0] = __byte_perm (w4[0], w3[3], selector); w6[3] = __byte_perm (w3[3], w3[2], selector); w6[2] = __byte_perm (w3[2], w3[1], selector); w6[1] = __byte_perm (w3[1], w3[0], selector); w6[0] = __byte_perm (w3[0], w2[3], selector); w5[3] = __byte_perm (w2[3], w2[2], selector); w5[2] = __byte_perm (w2[2], 
w2[1], selector); w5[1] = __byte_perm (w2[1], w2[0], selector); w5[0] = __byte_perm (w2[0], w1[3], selector); w4[3] = __byte_perm (w1[3], w1[2], selector); w4[2] = __byte_perm (w1[2], w1[1], selector); w4[1] = __byte_perm (w1[1], w1[0], selector); w4[0] = __byte_perm (w1[0], w0[3], selector); w3[3] = __byte_perm (w0[3], w0[2], selector); w3[2] = __byte_perm (w0[2], w0[1], selector); w3[1] = __byte_perm (w0[1], w0[0], selector); w3[0] = __byte_perm (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = __byte_perm ( 0, w7[3], selector); c3[0] = __byte_perm (w7[3], w7[2], selector); c2[3] = __byte_perm (w7[2], w7[1], selector); c2[2] = __byte_perm (w7[1], w7[0], selector); c2[1] = __byte_perm (w7[0], w6[3], selector); c2[0] = __byte_perm (w6[3], w6[2], selector); c1[3] = __byte_perm (w6[2], w6[1], selector); c1[2] = __byte_perm (w6[1], w6[0], selector); c1[1] = __byte_perm (w6[0], w5[3], selector); c1[0] = __byte_perm (w5[3], w5[2], selector); c0[3] = __byte_perm (w5[2], w5[1], selector); c0[2] = __byte_perm (w5[1], w5[0], selector); c0[1] = __byte_perm (w5[0], w4[3], selector); c0[0] = __byte_perm (w4[3], w4[2], selector); w7[3] = __byte_perm (w4[2], w4[1], selector); w7[2] = __byte_perm (w4[1], w4[0], selector); w7[1] = __byte_perm (w4[0], w3[3], selector); w7[0] = __byte_perm (w3[3], w3[2], selector); w6[3] = __byte_perm (w3[2], w3[1], selector); w6[2] = __byte_perm (w3[1], w3[0], selector); w6[1] = __byte_perm (w3[0], w2[3], selector); w6[0] = __byte_perm (w2[3], w2[2], selector); w5[3] = __byte_perm (w2[2], w2[1], selector); w5[2] = __byte_perm (w2[1], w2[0], selector); w5[1] = __byte_perm (w2[0], w1[3], selector); w5[0] = __byte_perm (w1[3], w1[2], selector); w4[3] = __byte_perm (w1[2], w1[1], selector); w4[2] = __byte_perm (w1[1], w1[0], selector); w4[1] = __byte_perm (w1[0], w0[3], selector); w4[0] = __byte_perm (w0[3], w0[2], selector); 
w3[3] = __byte_perm (w0[2], w0[1], selector); w3[2] = __byte_perm (w0[1], w0[0], selector); w3[1] = __byte_perm (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = __byte_perm ( 0, w7[3], selector); c3[1] = __byte_perm (w7[3], w7[2], selector); c3[0] = __byte_perm (w7[2], w7[1], selector); c2[3] = __byte_perm (w7[1], w7[0], selector); c2[2] = __byte_perm (w7[0], w6[3], selector); c2[1] = __byte_perm (w6[3], w6[2], selector); c2[0] = __byte_perm (w6[2], w6[1], selector); c1[3] = __byte_perm (w6[1], w6[0], selector); c1[2] = __byte_perm (w6[0], w5[3], selector); c1[1] = __byte_perm (w5[3], w5[2], selector); c1[0] = __byte_perm (w5[2], w5[1], selector); c0[3] = __byte_perm (w5[1], w5[0], selector); c0[2] = __byte_perm (w5[0], w4[3], selector); c0[1] = __byte_perm (w4[3], w4[2], selector); c0[0] = __byte_perm (w4[2], w4[1], selector); w7[3] = __byte_perm (w4[1], w4[0], selector); w7[2] = __byte_perm (w4[0], w3[3], selector); w7[1] = __byte_perm (w3[3], w3[2], selector); w7[0] = __byte_perm (w3[2], w3[1], selector); w6[3] = __byte_perm (w3[1], w3[0], selector); w6[2] = __byte_perm (w3[0], w2[3], selector); w6[1] = __byte_perm (w2[3], w2[2], selector); w6[0] = __byte_perm (w2[2], w2[1], selector); w5[3] = __byte_perm (w2[1], w2[0], selector); w5[2] = __byte_perm (w2[0], w1[3], selector); w5[1] = __byte_perm (w1[3], w1[2], selector); w5[0] = __byte_perm (w1[2], w1[1], selector); w4[3] = __byte_perm (w1[1], w1[0], selector); w4[2] = __byte_perm (w1[0], w0[3], selector); w4[1] = __byte_perm (w0[3], w0[2], selector); w4[0] = __byte_perm (w0[2], w0[1], selector); w3[3] = __byte_perm (w0[1], w0[0], selector); w3[2] = __byte_perm (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = __byte_perm ( 0, 
w7[3], selector); c3[2] = __byte_perm (w7[3], w7[2], selector); c3[1] = __byte_perm (w7[2], w7[1], selector); c3[0] = __byte_perm (w7[1], w7[0], selector); c2[3] = __byte_perm (w7[0], w6[3], selector); c2[2] = __byte_perm (w6[3], w6[2], selector); c2[1] = __byte_perm (w6[2], w6[1], selector); c2[0] = __byte_perm (w6[1], w6[0], selector); c1[3] = __byte_perm (w6[0], w5[3], selector); c1[2] = __byte_perm (w5[3], w5[2], selector); c1[1] = __byte_perm (w5[2], w5[1], selector); c1[0] = __byte_perm (w5[1], w5[0], selector); c0[3] = __byte_perm (w5[0], w4[3], selector); c0[2] = __byte_perm (w4[3], w4[2], selector); c0[1] = __byte_perm (w4[2], w4[1], selector); c0[0] = __byte_perm (w4[1], w4[0], selector); w7[3] = __byte_perm (w4[0], w3[3], selector); w7[2] = __byte_perm (w3[3], w3[2], selector); w7[1] = __byte_perm (w3[2], w3[1], selector); w7[0] = __byte_perm (w3[1], w3[0], selector); w6[3] = __byte_perm (w3[0], w2[3], selector); w6[2] = __byte_perm (w2[3], w2[2], selector); w6[1] = __byte_perm (w2[2], w2[1], selector); w6[0] = __byte_perm (w2[1], w2[0], selector); w5[3] = __byte_perm (w2[0], w1[3], selector); w5[2] = __byte_perm (w1[3], w1[2], selector); w5[1] = __byte_perm (w1[2], w1[1], selector); w5[0] = __byte_perm (w1[1], w1[0], selector); w4[3] = __byte_perm (w1[0], w0[3], selector); w4[2] = __byte_perm (w0[3], w0[2], selector); w4[1] = __byte_perm (w0[2], w0[1], selector); w4[0] = __byte_perm (w0[1], w0[0], selector); w3[3] = __byte_perm (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: c4[0] = __byte_perm ( 0, w7[3], selector); c3[3] = __byte_perm (w7[3], w7[2], selector); c3[2] = __byte_perm (w7[2], w7[1], selector); c3[1] = __byte_perm (w7[1], w7[0], selector); c3[0] = __byte_perm (w7[0], w6[3], selector); c2[3] = __byte_perm (w6[3], w6[2], selector); c2[2] = __byte_perm (w6[2], w6[1], selector); c2[1] = 
__byte_perm (w6[1], w6[0], selector); c2[0] = __byte_perm (w6[0], w5[3], selector); c1[3] = __byte_perm (w5[3], w5[2], selector); c1[2] = __byte_perm (w5[2], w5[1], selector); c1[1] = __byte_perm (w5[1], w5[0], selector); c1[0] = __byte_perm (w5[0], w4[3], selector); c0[3] = __byte_perm (w4[3], w4[2], selector); c0[2] = __byte_perm (w4[2], w4[1], selector); c0[1] = __byte_perm (w4[1], w4[0], selector); c0[0] = __byte_perm (w4[0], w3[3], selector); w7[3] = __byte_perm (w3[3], w3[2], selector); w7[2] = __byte_perm (w3[2], w3[1], selector); w7[1] = __byte_perm (w3[1], w3[0], selector); w7[0] = __byte_perm (w3[0], w2[3], selector); w6[3] = __byte_perm (w2[3], w2[2], selector); w6[2] = __byte_perm (w2[2], w2[1], selector); w6[1] = __byte_perm (w2[1], w2[0], selector); w6[0] = __byte_perm (w2[0], w1[3], selector); w5[3] = __byte_perm (w1[3], w1[2], selector); w5[2] = __byte_perm (w1[2], w1[1], selector); w5[1] = __byte_perm (w1[1], w1[0], selector); w5[0] = __byte_perm (w1[0], w0[3], selector); w4[3] = __byte_perm (w0[3], w0[2], selector); w4[2] = __byte_perm (w0[2], w0[1], selector); w4[1] = __byte_perm (w0[1], w0[0], selector); w4[0] = __byte_perm (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: c4[1] = __byte_perm ( 0, w7[3], selector); c4[0] = __byte_perm (w7[3], w7[2], selector); c3[3] = __byte_perm (w7[2], w7[1], selector); c3[2] = __byte_perm (w7[1], w7[0], selector); c3[1] = __byte_perm (w7[0], w6[3], selector); c3[0] = __byte_perm (w6[3], w6[2], selector); c2[3] = __byte_perm (w6[2], w6[1], selector); c2[2] = __byte_perm (w6[1], w6[0], selector); c2[1] = __byte_perm (w6[0], w5[3], selector); c2[0] = __byte_perm (w5[3], w5[2], selector); c1[3] = __byte_perm (w5[2], w5[1], selector); c1[2] = __byte_perm (w5[1], w5[0], selector); c1[1] = __byte_perm (w5[0], w4[3], selector); c1[0] = __byte_perm (w4[3], 
w4[2], selector); c0[3] = __byte_perm (w4[2], w4[1], selector); c0[2] = __byte_perm (w4[1], w4[0], selector); c0[1] = __byte_perm (w4[0], w3[3], selector); c0[0] = __byte_perm (w3[3], w3[2], selector); w7[3] = __byte_perm (w3[2], w3[1], selector); w7[2] = __byte_perm (w3[1], w3[0], selector); w7[1] = __byte_perm (w3[0], w2[3], selector); w7[0] = __byte_perm (w2[3], w2[2], selector); w6[3] = __byte_perm (w2[2], w2[1], selector); w6[2] = __byte_perm (w2[1], w2[0], selector); w6[1] = __byte_perm (w2[0], w1[3], selector); w6[0] = __byte_perm (w1[3], w1[2], selector); w5[3] = __byte_perm (w1[2], w1[1], selector); w5[2] = __byte_perm (w1[1], w1[0], selector); w5[1] = __byte_perm (w1[0], w0[3], selector); w5[0] = __byte_perm (w0[3], w0[2], selector); w4[3] = __byte_perm (w0[2], w0[1], selector); w4[2] = __byte_perm (w0[1], w0[0], selector); w4[1] = __byte_perm (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: c4[2] = __byte_perm ( 0, w7[3], selector); c4[1] = __byte_perm (w7[3], w7[2], selector); c4[0] = __byte_perm (w7[2], w7[1], selector); c3[3] = __byte_perm (w7[1], w7[0], selector); c3[2] = __byte_perm (w7[0], w6[3], selector); c3[1] = __byte_perm (w6[3], w6[2], selector); c3[0] = __byte_perm (w6[2], w6[1], selector); c2[3] = __byte_perm (w6[1], w6[0], selector); c2[2] = __byte_perm (w6[0], w5[3], selector); c2[1] = __byte_perm (w5[3], w5[2], selector); c2[0] = __byte_perm (w5[2], w5[1], selector); c1[3] = __byte_perm (w5[1], w5[0], selector); c1[2] = __byte_perm (w5[0], w4[3], selector); c1[1] = __byte_perm (w4[3], w4[2], selector); c1[0] = __byte_perm (w4[2], w4[1], selector); c0[3] = __byte_perm (w4[1], w4[0], selector); c0[2] = __byte_perm (w4[0], w3[3], selector); c0[1] = __byte_perm (w3[3], w3[2], selector); c0[0] = __byte_perm (w3[2], w3[1], selector); w7[3] = __byte_perm (w3[1], w3[0], 
selector); w7[2] = __byte_perm (w3[0], w2[3], selector); w7[1] = __byte_perm (w2[3], w2[2], selector); w7[0] = __byte_perm (w2[2], w2[1], selector); w6[3] = __byte_perm (w2[1], w2[0], selector); w6[2] = __byte_perm (w2[0], w1[3], selector); w6[1] = __byte_perm (w1[3], w1[2], selector); w6[0] = __byte_perm (w1[2], w1[1], selector); w5[3] = __byte_perm (w1[1], w1[0], selector); w5[2] = __byte_perm (w1[0], w0[3], selector); w5[1] = __byte_perm (w0[3], w0[2], selector); w5[0] = __byte_perm (w0[2], w0[1], selector); w4[3] = __byte_perm (w0[1], w0[0], selector); w4[2] = __byte_perm (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: c4[3] = __byte_perm ( 0, w7[3], selector); c4[2] = __byte_perm (w7[3], w7[2], selector); c4[1] = __byte_perm (w7[2], w7[1], selector); c4[0] = __byte_perm (w7[1], w7[0], selector); c3[3] = __byte_perm (w7[0], w6[3], selector); c3[2] = __byte_perm (w6[3], w6[2], selector); c3[1] = __byte_perm (w6[2], w6[1], selector); c3[0] = __byte_perm (w6[1], w6[0], selector); c2[3] = __byte_perm (w6[0], w5[3], selector); c2[2] = __byte_perm (w5[3], w5[2], selector); c2[1] = __byte_perm (w5[2], w5[1], selector); c2[0] = __byte_perm (w5[1], w5[0], selector); c1[3] = __byte_perm (w5[0], w4[3], selector); c1[2] = __byte_perm (w4[3], w4[2], selector); c1[1] = __byte_perm (w4[2], w4[1], selector); c1[0] = __byte_perm (w4[1], w4[0], selector); c0[3] = __byte_perm (w4[0], w3[3], selector); c0[2] = __byte_perm (w3[3], w3[2], selector); c0[1] = __byte_perm (w3[2], w3[1], selector); c0[0] = __byte_perm (w3[1], w3[0], selector); w7[3] = __byte_perm (w3[0], w2[3], selector); w7[2] = __byte_perm (w2[3], w2[2], selector); w7[1] = __byte_perm (w2[2], w2[1], selector); w7[0] = __byte_perm (w2[1], w2[0], selector); w6[3] = __byte_perm (w2[0], w1[3], selector); w6[2] = __byte_perm (w1[3], 
w1[2], selector); w6[1] = __byte_perm (w1[2], w1[1], selector); w6[0] = __byte_perm (w1[1], w1[0], selector); w5[3] = __byte_perm (w1[0], w0[3], selector); w5[2] = __byte_perm (w0[3], w0[2], selector); w5[1] = __byte_perm (w0[2], w0[1], selector); w5[0] = __byte_perm (w0[1], w0[0], selector); w4[3] = __byte_perm (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: c5[0] = __byte_perm ( 0, w7[3], selector); c4[3] = __byte_perm (w7[3], w7[2], selector); c4[2] = __byte_perm (w7[2], w7[1], selector); c4[1] = __byte_perm (w7[1], w7[0], selector); c4[0] = __byte_perm (w7[0], w6[3], selector); c3[3] = __byte_perm (w6[3], w6[2], selector); c3[2] = __byte_perm (w6[2], w6[1], selector); c3[1] = __byte_perm (w6[1], w6[0], selector); c3[0] = __byte_perm (w6[0], w5[3], selector); c2[3] = __byte_perm (w5[3], w5[2], selector); c2[2] = __byte_perm (w5[2], w5[1], selector); c2[1] = __byte_perm (w5[1], w5[0], selector); c2[0] = __byte_perm (w5[0], w4[3], selector); c1[3] = __byte_perm (w4[3], w4[2], selector); c1[2] = __byte_perm (w4[2], w4[1], selector); c1[1] = __byte_perm (w4[1], w4[0], selector); c1[0] = __byte_perm (w4[0], w3[3], selector); c0[3] = __byte_perm (w3[3], w3[2], selector); c0[2] = __byte_perm (w3[2], w3[1], selector); c0[1] = __byte_perm (w3[1], w3[0], selector); c0[0] = __byte_perm (w3[0], w2[3], selector); w7[3] = __byte_perm (w2[3], w2[2], selector); w7[2] = __byte_perm (w2[2], w2[1], selector); w7[1] = __byte_perm (w2[1], w2[0], selector); w7[0] = __byte_perm (w2[0], w1[3], selector); w6[3] = __byte_perm (w1[3], w1[2], selector); w6[2] = __byte_perm (w1[2], w1[1], selector); w6[1] = __byte_perm (w1[1], w1[0], selector); w6[0] = __byte_perm (w1[0], w0[3], selector); w5[3] = __byte_perm (w0[3], w0[2], selector); w5[2] = __byte_perm (w0[2], w0[1], selector); w5[1] = 
__byte_perm (w0[1], w0[0], selector); w5[0] = __byte_perm (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: c5[1] = __byte_perm ( 0, w7[3], selector); c5[0] = __byte_perm (w7[3], w7[2], selector); c4[3] = __byte_perm (w7[2], w7[1], selector); c4[2] = __byte_perm (w7[1], w7[0], selector); c4[1] = __byte_perm (w7[0], w6[3], selector); c4[0] = __byte_perm (w6[3], w6[2], selector); c3[3] = __byte_perm (w6[2], w6[1], selector); c3[2] = __byte_perm (w6[1], w6[0], selector); c3[1] = __byte_perm (w6[0], w5[3], selector); c3[0] = __byte_perm (w5[3], w5[2], selector); c2[3] = __byte_perm (w5[2], w5[1], selector); c2[2] = __byte_perm (w5[1], w5[0], selector); c2[1] = __byte_perm (w5[0], w4[3], selector); c2[0] = __byte_perm (w4[3], w4[2], selector); c1[3] = __byte_perm (w4[2], w4[1], selector); c1[2] = __byte_perm (w4[1], w4[0], selector); c1[1] = __byte_perm (w4[0], w3[3], selector); c1[0] = __byte_perm (w3[3], w3[2], selector); c0[3] = __byte_perm (w3[2], w3[1], selector); c0[2] = __byte_perm (w3[1], w3[0], selector); c0[1] = __byte_perm (w3[0], w2[3], selector); c0[0] = __byte_perm (w2[3], w2[2], selector); w7[3] = __byte_perm (w2[2], w2[1], selector); w7[2] = __byte_perm (w2[1], w2[0], selector); w7[1] = __byte_perm (w2[0], w1[3], selector); w7[0] = __byte_perm (w1[3], w1[2], selector); w6[3] = __byte_perm (w1[2], w1[1], selector); w6[2] = __byte_perm (w1[1], w1[0], selector); w6[1] = __byte_perm (w1[0], w0[3], selector); w6[0] = __byte_perm (w0[3], w0[2], selector); w5[3] = __byte_perm (w0[2], w0[1], selector); w5[2] = __byte_perm (w0[1], w0[0], selector); w5[1] = __byte_perm (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 
0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: c5[2] = __byte_perm ( 0, w7[3], selector); c5[1] = __byte_perm (w7[3], w7[2], selector); c5[0] = __byte_perm (w7[2], w7[1], selector); c4[3] = __byte_perm (w7[1], w7[0], selector); c4[2] = __byte_perm (w7[0], w6[3], selector); c4[1] = __byte_perm (w6[3], w6[2], selector); c4[0] = __byte_perm (w6[2], w6[1], selector); c3[3] = __byte_perm (w6[1], w6[0], selector); c3[2] = __byte_perm (w6[0], w5[3], selector); c3[1] = __byte_perm (w5[3], w5[2], selector); c3[0] = __byte_perm (w5[2], w5[1], selector); c2[3] = __byte_perm (w5[1], w5[0], selector); c2[2] = __byte_perm (w5[0], w4[3], selector); c2[1] = __byte_perm (w4[3], w4[2], selector); c2[0] = __byte_perm (w4[2], w4[1], selector); c1[3] = __byte_perm (w4[1], w4[0], selector); c1[2] = __byte_perm (w4[0], w3[3], selector); c1[1] = __byte_perm (w3[3], w3[2], selector); c1[0] = __byte_perm (w3[2], w3[1], selector); c0[3] = __byte_perm (w3[1], w3[0], selector); c0[2] = __byte_perm (w3[0], w2[3], selector); c0[1] = __byte_perm (w2[3], w2[2], selector); c0[0] = __byte_perm (w2[2], w2[1], selector); w7[3] = __byte_perm (w2[1], w2[0], selector); w7[2] = __byte_perm (w2[0], w1[3], selector); w7[1] = __byte_perm (w1[3], w1[2], selector); w7[0] = __byte_perm (w1[2], w1[1], selector); w6[3] = __byte_perm (w1[1], w1[0], selector); w6[2] = __byte_perm (w1[0], w0[3], selector); w6[1] = __byte_perm (w0[3], w0[2], selector); w6[0] = __byte_perm (w0[2], w0[1], selector); w5[3] = __byte_perm (w0[1], w0[0], selector); w5[2] = __byte_perm (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: c5[3] = __byte_perm ( 0, w7[3], selector); c5[2] = __byte_perm (w7[3], w7[2], selector); c5[1] = __byte_perm (w7[2], w7[1], selector); c5[0] = __byte_perm 
(w7[1], w7[0], selector); c4[3] = __byte_perm (w7[0], w6[3], selector); c4[2] = __byte_perm (w6[3], w6[2], selector); c4[1] = __byte_perm (w6[2], w6[1], selector); c4[0] = __byte_perm (w6[1], w6[0], selector); c3[3] = __byte_perm (w6[0], w5[3], selector); c3[2] = __byte_perm (w5[3], w5[2], selector); c3[1] = __byte_perm (w5[2], w5[1], selector); c3[0] = __byte_perm (w5[1], w5[0], selector); c2[3] = __byte_perm (w5[0], w4[3], selector); c2[2] = __byte_perm (w4[3], w4[2], selector); c2[1] = __byte_perm (w4[2], w4[1], selector); c2[0] = __byte_perm (w4[1], w4[0], selector); c1[3] = __byte_perm (w4[0], w3[3], selector); c1[2] = __byte_perm (w3[3], w3[2], selector); c1[1] = __byte_perm (w3[2], w3[1], selector); c1[0] = __byte_perm (w3[1], w3[0], selector); c0[3] = __byte_perm (w3[0], w2[3], selector); c0[2] = __byte_perm (w2[3], w2[2], selector); c0[1] = __byte_perm (w2[2], w2[1], selector); c0[0] = __byte_perm (w2[1], w2[0], selector); w7[3] = __byte_perm (w2[0], w1[3], selector); w7[2] = __byte_perm (w1[3], w1[2], selector); w7[1] = __byte_perm (w1[2], w1[1], selector); w7[0] = __byte_perm (w1[1], w1[0], selector); w6[3] = __byte_perm (w1[0], w0[3], selector); w6[2] = __byte_perm (w0[3], w0[2], selector); w6[1] = __byte_perm (w0[2], w0[1], selector); w6[0] = __byte_perm (w0[1], w0[0], selector); w5[3] = __byte_perm (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: c6[0] = __byte_perm ( 0, w7[3], selector); c5[3] = __byte_perm (w7[3], w7[2], selector); c5[2] = __byte_perm (w7[2], w7[1], selector); c5[1] = __byte_perm (w7[1], w7[0], selector); c5[0] = __byte_perm (w7[0], w6[3], selector); c4[3] = __byte_perm (w6[3], w6[2], selector); c4[2] = __byte_perm (w6[2], w6[1], selector); c4[1] = __byte_perm (w6[1], w6[0], selector); c4[0] = 
__byte_perm (w6[0], w5[3], selector); c3[3] = __byte_perm (w5[3], w5[2], selector); c3[2] = __byte_perm (w5[2], w5[1], selector); c3[1] = __byte_perm (w5[1], w5[0], selector); c3[0] = __byte_perm (w5[0], w4[3], selector); c2[3] = __byte_perm (w4[3], w4[2], selector); c2[2] = __byte_perm (w4[2], w4[1], selector); c2[1] = __byte_perm (w4[1], w4[0], selector); c2[0] = __byte_perm (w4[0], w3[3], selector); c1[3] = __byte_perm (w3[3], w3[2], selector); c1[2] = __byte_perm (w3[2], w3[1], selector); c1[1] = __byte_perm (w3[1], w3[0], selector); c1[0] = __byte_perm (w3[0], w2[3], selector); c0[3] = __byte_perm (w2[3], w2[2], selector); c0[2] = __byte_perm (w2[2], w2[1], selector); c0[1] = __byte_perm (w2[1], w2[0], selector); c0[0] = __byte_perm (w2[0], w1[3], selector); w7[3] = __byte_perm (w1[3], w1[2], selector); w7[2] = __byte_perm (w1[2], w1[1], selector); w7[1] = __byte_perm (w1[1], w1[0], selector); w7[0] = __byte_perm (w1[0], w0[3], selector); w6[3] = __byte_perm (w0[3], w0[2], selector); w6[2] = __byte_perm (w0[2], w0[1], selector); w6[1] = __byte_perm (w0[1], w0[0], selector); w6[0] = __byte_perm (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: c6[1] = __byte_perm ( 0, w7[3], selector); c6[0] = __byte_perm (w7[3], w7[2], selector); c5[3] = __byte_perm (w7[2], w7[1], selector); c5[2] = __byte_perm (w7[1], w7[0], selector); c5[1] = __byte_perm (w7[0], w6[3], selector); c5[0] = __byte_perm (w6[3], w6[2], selector); c4[3] = __byte_perm (w6[2], w6[1], selector); c4[2] = __byte_perm (w6[1], w6[0], selector); c4[1] = __byte_perm (w6[0], w5[3], selector); c4[0] = __byte_perm (w5[3], w5[2], selector); c3[3] = __byte_perm (w5[2], w5[1], selector); c3[2] = __byte_perm (w5[1], w5[0], selector); c3[1] = __byte_perm (w5[0], 
w4[3], selector); c3[0] = __byte_perm (w4[3], w4[2], selector); c2[3] = __byte_perm (w4[2], w4[1], selector); c2[2] = __byte_perm (w4[1], w4[0], selector); c2[1] = __byte_perm (w4[0], w3[3], selector); c2[0] = __byte_perm (w3[3], w3[2], selector); c1[3] = __byte_perm (w3[2], w3[1], selector); c1[2] = __byte_perm (w3[1], w3[0], selector); c1[1] = __byte_perm (w3[0], w2[3], selector); c1[0] = __byte_perm (w2[3], w2[2], selector); c0[3] = __byte_perm (w2[2], w2[1], selector); c0[2] = __byte_perm (w2[1], w2[0], selector); c0[1] = __byte_perm (w2[0], w1[3], selector); c0[0] = __byte_perm (w1[3], w1[2], selector); w7[3] = __byte_perm (w1[2], w1[1], selector); w7[2] = __byte_perm (w1[1], w1[0], selector); w7[1] = __byte_perm (w1[0], w0[3], selector); w7[0] = __byte_perm (w0[3], w0[2], selector); w6[3] = __byte_perm (w0[2], w0[1], selector); w6[2] = __byte_perm (w0[1], w0[0], selector); w6[1] = __byte_perm (w0[0], 0, selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: c6[2] = __byte_perm ( 0, w7[3], selector); c6[1] = __byte_perm (w7[3], w7[2], selector); c6[0] = __byte_perm (w7[2], w7[1], selector); c5[3] = __byte_perm (w7[1], w7[0], selector); c5[2] = __byte_perm (w7[0], w6[3], selector); c5[1] = __byte_perm (w6[3], w6[2], selector); c5[0] = __byte_perm (w6[2], w6[1], selector); c4[3] = __byte_perm (w6[1], w6[0], selector); c4[2] = __byte_perm (w6[0], w5[3], selector); c4[1] = __byte_perm (w5[3], w5[2], selector); c4[0] = __byte_perm (w5[2], w5[1], selector); c3[3] = __byte_perm (w5[1], w5[0], selector); c3[2] = __byte_perm (w5[0], w4[3], selector); c3[1] = __byte_perm (w4[3], w4[2], selector); c3[0] = __byte_perm (w4[2], w4[1], selector); c2[3] = __byte_perm (w4[1], w4[0], selector); c2[2] = __byte_perm (w4[0], w3[3], 
selector); c2[1] = __byte_perm (w3[3], w3[2], selector); c2[0] = __byte_perm (w3[2], w3[1], selector); c1[3] = __byte_perm (w3[1], w3[0], selector); c1[2] = __byte_perm (w3[0], w2[3], selector); c1[1] = __byte_perm (w2[3], w2[2], selector); c1[0] = __byte_perm (w2[2], w2[1], selector); c0[3] = __byte_perm (w2[1], w2[0], selector); c0[2] = __byte_perm (w2[0], w1[3], selector); c0[1] = __byte_perm (w1[3], w1[2], selector); c0[0] = __byte_perm (w1[2], w1[1], selector); w7[3] = __byte_perm (w1[1], w1[0], selector); w7[2] = __byte_perm (w1[0], w0[3], selector); w7[1] = __byte_perm (w0[3], w0[2], selector); w7[0] = __byte_perm (w0[2], w0[1], selector); w6[3] = __byte_perm (w0[1], w0[0], selector); w6[2] = __byte_perm (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: c6[3] = __byte_perm ( 0, w7[3], selector); c6[2] = __byte_perm (w7[3], w7[2], selector); c6[1] = __byte_perm (w7[2], w7[1], selector); c6[0] = __byte_perm (w7[1], w7[0], selector); c5[3] = __byte_perm (w7[0], w6[3], selector); c5[2] = __byte_perm (w6[3], w6[2], selector); c5[1] = __byte_perm (w6[2], w6[1], selector); c5[0] = __byte_perm (w6[1], w6[0], selector); c4[3] = __byte_perm (w6[0], w5[3], selector); c4[2] = __byte_perm (w5[3], w5[2], selector); c4[1] = __byte_perm (w5[2], w5[1], selector); c4[0] = __byte_perm (w5[1], w5[0], selector); c3[3] = __byte_perm (w5[0], w4[3], selector); c3[2] = __byte_perm (w4[3], w4[2], selector); c3[1] = __byte_perm (w4[2], w4[1], selector); c3[0] = __byte_perm (w4[1], w4[0], selector); c2[3] = __byte_perm (w4[0], w3[3], selector); c2[2] = __byte_perm (w3[3], w3[2], selector); c2[1] = __byte_perm (w3[2], w3[1], selector); c2[0] = __byte_perm (w3[1], w3[0], selector); c1[3] = __byte_perm (w3[0], w2[3], 
selector); c1[2] = __byte_perm (w2[3], w2[2], selector); c1[1] = __byte_perm (w2[2], w2[1], selector); c1[0] = __byte_perm (w2[1], w2[0], selector); c0[3] = __byte_perm (w2[0], w1[3], selector); c0[2] = __byte_perm (w1[3], w1[2], selector); c0[1] = __byte_perm (w1[2], w1[1], selector); c0[0] = __byte_perm (w1[1], w1[0], selector); w7[3] = __byte_perm (w1[0], w0[3], selector); w7[2] = __byte_perm (w0[3], w0[2], selector); w7[1] = __byte_perm (w0[2], w0[1], selector); w7[0] = __byte_perm (w0[1], w0[0], selector); w6[3] = __byte_perm (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: c7[0] = __byte_perm ( 0, w7[3], selector); c6[3] = __byte_perm (w7[3], w7[2], selector); c6[2] = __byte_perm (w7[2], w7[1], selector); c6[1] = __byte_perm (w7[1], w7[0], selector); c6[0] = __byte_perm (w7[0], w6[3], selector); c5[3] = __byte_perm (w6[3], w6[2], selector); c5[2] = __byte_perm (w6[2], w6[1], selector); c5[1] = __byte_perm (w6[1], w6[0], selector); c5[0] = __byte_perm (w6[0], w5[3], selector); c4[3] = __byte_perm (w5[3], w5[2], selector); c4[2] = __byte_perm (w5[2], w5[1], selector); c4[1] = __byte_perm (w5[1], w5[0], selector); c4[0] = __byte_perm (w5[0], w4[3], selector); c3[3] = __byte_perm (w4[3], w4[2], selector); c3[2] = __byte_perm (w4[2], w4[1], selector); c3[1] = __byte_perm (w4[1], w4[0], selector); c3[0] = __byte_perm (w4[0], w3[3], selector); c2[3] = __byte_perm (w3[3], w3[2], selector); c2[2] = __byte_perm (w3[2], w3[1], selector); c2[1] = __byte_perm (w3[1], w3[0], selector); c2[0] = __byte_perm (w3[0], w2[3], selector); c1[3] = __byte_perm (w2[3], w2[2], selector); c1[2] = __byte_perm (w2[2], w2[1], selector); c1[1] = __byte_perm (w2[1], w2[0], selector); c1[0] = __byte_perm 
(w2[0], w1[3], selector); c0[3] = __byte_perm (w1[3], w1[2], selector); c0[2] = __byte_perm (w1[2], w1[1], selector); c0[1] = __byte_perm (w1[1], w1[0], selector); c0[0] = __byte_perm (w1[0], w0[3], selector); w7[3] = __byte_perm (w0[3], w0[2], selector); w7[2] = __byte_perm (w0[2], w0[1], selector); w7[1] = __byte_perm (w0[1], w0[0], selector); w7[0] = __byte_perm (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: c7[1] = __byte_perm ( 0, w7[3], selector); c7[0] = __byte_perm (w7[3], w7[2], selector); c6[3] = __byte_perm (w7[2], w7[1], selector); c6[2] = __byte_perm (w7[1], w7[0], selector); c6[1] = __byte_perm (w7[0], w6[3], selector); c6[0] = __byte_perm (w6[3], w6[2], selector); c5[3] = __byte_perm (w6[2], w6[1], selector); c5[2] = __byte_perm (w6[1], w6[0], selector); c5[1] = __byte_perm (w6[0], w5[3], selector); c5[0] = __byte_perm (w5[3], w5[2], selector); c4[3] = __byte_perm (w5[2], w5[1], selector); c4[2] = __byte_perm (w5[1], w5[0], selector); c4[1] = __byte_perm (w5[0], w4[3], selector); c4[0] = __byte_perm (w4[3], w4[2], selector); c3[3] = __byte_perm (w4[2], w4[1], selector); c3[2] = __byte_perm (w4[1], w4[0], selector); c3[1] = __byte_perm (w4[0], w3[3], selector); c3[0] = __byte_perm (w3[3], w3[2], selector); c2[3] = __byte_perm (w3[2], w3[1], selector); c2[2] = __byte_perm (w3[1], w3[0], selector); c2[1] = __byte_perm (w3[0], w2[3], selector); c2[0] = __byte_perm (w2[3], w2[2], selector); c1[3] = __byte_perm (w2[2], w2[1], selector); c1[2] = __byte_perm (w2[1], w2[0], selector); c1[1] = __byte_perm (w2[0], w1[3], selector); c1[0] = __byte_perm (w1[3], w1[2], selector); c0[3] = __byte_perm (w1[2], w1[1], selector); c0[2] = __byte_perm (w1[1], w1[0], selector); 
c0[1] = __byte_perm (w1[0], w0[3], selector); c0[0] = __byte_perm (w0[3], w0[2], selector); w7[3] = __byte_perm (w0[2], w0[1], selector); w7[2] = __byte_perm (w0[1], w0[0], selector); w7[1] = __byte_perm (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: c7[2] = __byte_perm ( 0, w7[3], selector); c7[1] = __byte_perm (w7[3], w7[2], selector); c7[0] = __byte_perm (w7[2], w7[1], selector); c6[3] = __byte_perm (w7[1], w7[0], selector); c6[2] = __byte_perm (w7[0], w6[3], selector); c6[1] = __byte_perm (w6[3], w6[2], selector); c6[0] = __byte_perm (w6[2], w6[1], selector); c5[3] = __byte_perm (w6[1], w6[0], selector); c5[2] = __byte_perm (w6[0], w5[3], selector); c5[1] = __byte_perm (w5[3], w5[2], selector); c5[0] = __byte_perm (w5[2], w5[1], selector); c4[3] = __byte_perm (w5[1], w5[0], selector); c4[2] = __byte_perm (w5[0], w4[3], selector); c4[1] = __byte_perm (w4[3], w4[2], selector); c4[0] = __byte_perm (w4[2], w4[1], selector); c3[3] = __byte_perm (w4[1], w4[0], selector); c3[2] = __byte_perm (w4[0], w3[3], selector); c3[1] = __byte_perm (w3[3], w3[2], selector); c3[0] = __byte_perm (w3[2], w3[1], selector); c2[3] = __byte_perm (w3[1], w3[0], selector); c2[2] = __byte_perm (w3[0], w2[3], selector); c2[1] = __byte_perm (w2[3], w2[2], selector); c2[0] = __byte_perm (w2[2], w2[1], selector); c1[3] = __byte_perm (w2[1], w2[0], selector); c1[2] = __byte_perm (w2[0], w1[3], selector); c1[1] = __byte_perm (w1[3], w1[2], selector); c1[0] = __byte_perm (w1[2], w1[1], selector); c0[3] = __byte_perm (w1[1], w1[0], selector); c0[2] = __byte_perm (w1[0], w0[3], selector); c0[1] = __byte_perm (w0[3], w0[2], selector); c0[0] = __byte_perm (w0[2], w0[1], selector); w7[3] = 
__byte_perm (w0[1], w0[0], selector); w7[2] = __byte_perm (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: c7[3] = __byte_perm ( 0, w7[3], selector); c7[2] = __byte_perm (w7[3], w7[2], selector); c7[1] = __byte_perm (w7[2], w7[1], selector); c7[0] = __byte_perm (w7[1], w7[0], selector); c6[3] = __byte_perm (w7[0], w6[3], selector); c6[2] = __byte_perm (w6[3], w6[2], selector); c6[1] = __byte_perm (w6[2], w6[1], selector); c6[0] = __byte_perm (w6[1], w6[0], selector); c5[3] = __byte_perm (w6[0], w5[3], selector); c5[2] = __byte_perm (w5[3], w5[2], selector); c5[1] = __byte_perm (w5[2], w5[1], selector); c5[0] = __byte_perm (w5[1], w5[0], selector); c4[3] = __byte_perm (w5[0], w4[3], selector); c4[2] = __byte_perm (w4[3], w4[2], selector); c4[1] = __byte_perm (w4[2], w4[1], selector); c4[0] = __byte_perm (w4[1], w4[0], selector); c3[3] = __byte_perm (w4[0], w3[3], selector); c3[2] = __byte_perm (w3[3], w3[2], selector); c3[1] = __byte_perm (w3[2], w3[1], selector); c3[0] = __byte_perm (w3[1], w3[0], selector); c2[3] = __byte_perm (w3[0], w2[3], selector); c2[2] = __byte_perm (w2[3], w2[2], selector); c2[1] = __byte_perm (w2[2], w2[1], selector); c2[0] = __byte_perm (w2[1], w2[0], selector); c1[3] = __byte_perm (w2[0], w1[3], selector); c1[2] = __byte_perm (w1[3], w1[2], selector); c1[1] = __byte_perm (w1[2], w1[1], selector); c1[0] = __byte_perm (w1[1], w1[0], selector); c0[3] = __byte_perm (w1[0], w0[3], selector); c0[2] = __byte_perm (w0[3], w0[2], selector); c0[1] = __byte_perm (w0[2], w0[1], selector); c0[0] = __byte_perm (w0[1], w0[0], selector); w7[3] = __byte_perm (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 
0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_1x64_le (u32x w[64], const u32 offset) { const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC #pragma unroll for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); switch (offset_switch) { case 0: w[63] = amd_bytealign (w[62], w[63], offset); w[62] = amd_bytealign (w[61], w[62], offset); w[61] = amd_bytealign (w[60], w[61], offset); w[60] = amd_bytealign (w[59], w[60], offset); w[59] = amd_bytealign (w[58], w[59], offset); w[58] = amd_bytealign (w[57], w[58], offset); w[57] = amd_bytealign (w[56], w[57], offset); w[56] = amd_bytealign (w[55], w[56], offset); w[55] = amd_bytealign (w[54], w[55], offset); w[54] = amd_bytealign (w[53], w[54], offset); w[53] = amd_bytealign (w[52], w[53], offset); w[52] = amd_bytealign (w[51], w[52], offset); w[51] = amd_bytealign (w[50], w[51], offset); w[50] = amd_bytealign (w[49], w[50], offset); w[49] = amd_bytealign (w[48], w[49], offset); w[48] = amd_bytealign (w[47], w[48], offset); w[47] = amd_bytealign (w[46], w[47], offset); w[46] = amd_bytealign (w[45], w[46], offset); w[45] = amd_bytealign (w[44], w[45], offset); w[44] = amd_bytealign (w[43], w[44], offset); w[43] = amd_bytealign (w[42], w[43], offset); w[42] = amd_bytealign (w[41], w[42], offset); w[41] = amd_bytealign (w[40], w[41], offset); w[40] = amd_bytealign (w[39], w[40], offset); w[39] = amd_bytealign (w[38], w[39], offset); w[38] = amd_bytealign (w[37], w[38], offset); w[37] = amd_bytealign (w[36], w[37], offset); w[36] = amd_bytealign (w[35], w[36], offset); w[35] = amd_bytealign (w[34], w[35], offset); w[34] = 
amd_bytealign (w[33], w[34], offset); w[33] = amd_bytealign (w[32], w[33], offset); w[32] = amd_bytealign (w[31], w[32], offset); w[31] = amd_bytealign (w[30], w[31], offset); w[30] = amd_bytealign (w[29], w[30], offset); w[29] = amd_bytealign (w[28], w[29], offset); w[28] = amd_bytealign (w[27], w[28], offset); w[27] = amd_bytealign (w[26], w[27], offset); w[26] = amd_bytealign (w[25], w[26], offset); w[25] = amd_bytealign (w[24], w[25], offset); w[24] = amd_bytealign (w[23], w[24], offset); w[23] = amd_bytealign (w[22], w[23], offset); w[22] = amd_bytealign (w[21], w[22], offset); w[21] = amd_bytealign (w[20], w[21], offset); w[20] = amd_bytealign (w[19], w[20], offset); w[19] = amd_bytealign (w[18], w[19], offset); w[18] = amd_bytealign (w[17], w[18], offset); w[17] = amd_bytealign (w[16], w[17], offset); w[16] = amd_bytealign (w[15], w[16], offset); w[15] = amd_bytealign (w[14], w[15], offset); w[14] = amd_bytealign (w[13], w[14], offset); w[13] = amd_bytealign (w[12], w[13], offset); w[12] = amd_bytealign (w[11], w[12], offset); w[11] = amd_bytealign (w[10], w[11], offset); w[10] = amd_bytealign (w[ 9], w[10], offset); w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); w[ 0] = amd_bytealign ( 0, w[ 0], offset); break; case 1: w[63] = amd_bytealign (w[61], w[62], offset); w[62] = amd_bytealign (w[60], w[61], offset); w[61] = amd_bytealign (w[59], w[60], offset); w[60] = amd_bytealign (w[58], w[59], offset); w[59] = amd_bytealign (w[57], w[58], offset); w[58] = amd_bytealign (w[56], w[57], offset); w[57] = amd_bytealign (w[55], w[56], offset); w[56] = amd_bytealign (w[54], w[55], offset); w[55] = 
amd_bytealign (w[53], w[54], offset); w[54] = amd_bytealign (w[52], w[53], offset); w[53] = amd_bytealign (w[51], w[52], offset); w[52] = amd_bytealign (w[50], w[51], offset); w[51] = amd_bytealign (w[49], w[50], offset); w[50] = amd_bytealign (w[48], w[49], offset); w[49] = amd_bytealign (w[47], w[48], offset); w[48] = amd_bytealign (w[46], w[47], offset); w[47] = amd_bytealign (w[45], w[46], offset); w[46] = amd_bytealign (w[44], w[45], offset); w[45] = amd_bytealign (w[43], w[44], offset); w[44] = amd_bytealign (w[42], w[43], offset); w[43] = amd_bytealign (w[41], w[42], offset); w[42] = amd_bytealign (w[40], w[41], offset); w[41] = amd_bytealign (w[39], w[40], offset); w[40] = amd_bytealign (w[38], w[39], offset); w[39] = amd_bytealign (w[37], w[38], offset); w[38] = amd_bytealign (w[36], w[37], offset); w[37] = amd_bytealign (w[35], w[36], offset); w[36] = amd_bytealign (w[34], w[35], offset); w[35] = amd_bytealign (w[33], w[34], offset); w[34] = amd_bytealign (w[32], w[33], offset); w[33] = amd_bytealign (w[31], w[32], offset); w[32] = amd_bytealign (w[30], w[31], offset); w[31] = amd_bytealign (w[29], w[30], offset); w[30] = amd_bytealign (w[28], w[29], offset); w[29] = amd_bytealign (w[27], w[28], offset); w[28] = amd_bytealign (w[26], w[27], offset); w[27] = amd_bytealign (w[25], w[26], offset); w[26] = amd_bytealign (w[24], w[25], offset); w[25] = amd_bytealign (w[23], w[24], offset); w[24] = amd_bytealign (w[22], w[23], offset); w[23] = amd_bytealign (w[21], w[22], offset); w[22] = amd_bytealign (w[20], w[21], offset); w[21] = amd_bytealign (w[19], w[20], offset); w[20] = amd_bytealign (w[18], w[19], offset); w[19] = amd_bytealign (w[17], w[18], offset); w[18] = amd_bytealign (w[16], w[17], offset); w[17] = amd_bytealign (w[15], w[16], offset); w[16] = amd_bytealign (w[14], w[15], offset); w[15] = amd_bytealign (w[13], w[14], offset); w[14] = amd_bytealign (w[12], w[13], offset); w[13] = amd_bytealign (w[11], w[12], offset); w[12] = amd_bytealign (w[10], 
w[11], offset); w[11] = amd_bytealign (w[ 9], w[10], offset); w[10] = amd_bytealign (w[ 8], w[ 9], offset); w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); w[ 1] = amd_bytealign ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: w[63] = amd_bytealign (w[60], w[61], offset); w[62] = amd_bytealign (w[59], w[60], offset); w[61] = amd_bytealign (w[58], w[59], offset); w[60] = amd_bytealign (w[57], w[58], offset); w[59] = amd_bytealign (w[56], w[57], offset); w[58] = amd_bytealign (w[55], w[56], offset); w[57] = amd_bytealign (w[54], w[55], offset); w[56] = amd_bytealign (w[53], w[54], offset); w[55] = amd_bytealign (w[52], w[53], offset); w[54] = amd_bytealign (w[51], w[52], offset); w[53] = amd_bytealign (w[50], w[51], offset); w[52] = amd_bytealign (w[49], w[50], offset); w[51] = amd_bytealign (w[48], w[49], offset); w[50] = amd_bytealign (w[47], w[48], offset); w[49] = amd_bytealign (w[46], w[47], offset); w[48] = amd_bytealign (w[45], w[46], offset); w[47] = amd_bytealign (w[44], w[45], offset); w[46] = amd_bytealign (w[43], w[44], offset); w[45] = amd_bytealign (w[42], w[43], offset); w[44] = amd_bytealign (w[41], w[42], offset); w[43] = amd_bytealign (w[40], w[41], offset); w[42] = amd_bytealign (w[39], w[40], offset); w[41] = amd_bytealign (w[38], w[39], offset); w[40] = amd_bytealign (w[37], w[38], offset); w[39] = amd_bytealign (w[36], w[37], offset); w[38] = amd_bytealign (w[35], w[36], offset); w[37] = amd_bytealign (w[34], w[35], offset); w[36] = amd_bytealign (w[33], w[34], offset); w[35] = amd_bytealign (w[32], w[33], offset); w[34] = amd_bytealign (w[31], w[32], offset); w[33] = amd_bytealign (w[30], w[31], offset); w[32] = amd_bytealign 
(w[29], w[30], offset); w[31] = amd_bytealign (w[28], w[29], offset); w[30] = amd_bytealign (w[27], w[28], offset); w[29] = amd_bytealign (w[26], w[27], offset); w[28] = amd_bytealign (w[25], w[26], offset); w[27] = amd_bytealign (w[24], w[25], offset); w[26] = amd_bytealign (w[23], w[24], offset); w[25] = amd_bytealign (w[22], w[23], offset); w[24] = amd_bytealign (w[21], w[22], offset); w[23] = amd_bytealign (w[20], w[21], offset); w[22] = amd_bytealign (w[19], w[20], offset); w[21] = amd_bytealign (w[18], w[19], offset); w[20] = amd_bytealign (w[17], w[18], offset); w[19] = amd_bytealign (w[16], w[17], offset); w[18] = amd_bytealign (w[15], w[16], offset); w[17] = amd_bytealign (w[14], w[15], offset); w[16] = amd_bytealign (w[13], w[14], offset); w[15] = amd_bytealign (w[12], w[13], offset); w[14] = amd_bytealign (w[11], w[12], offset); w[13] = amd_bytealign (w[10], w[11], offset); w[12] = amd_bytealign (w[ 9], w[10], offset); w[11] = amd_bytealign (w[ 8], w[ 9], offset); w[10] = amd_bytealign (w[ 7], w[ 8], offset); w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); w[ 2] = amd_bytealign ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = amd_bytealign (w[59], w[60], offset); w[62] = amd_bytealign (w[58], w[59], offset); w[61] = amd_bytealign (w[57], w[58], offset); w[60] = amd_bytealign (w[56], w[57], offset); w[59] = amd_bytealign (w[55], w[56], offset); w[58] = amd_bytealign (w[54], w[55], offset); w[57] = amd_bytealign (w[53], w[54], offset); w[56] = amd_bytealign (w[52], w[53], offset); w[55] = amd_bytealign (w[51], w[52], offset); w[54] = amd_bytealign (w[50], w[51], offset); w[53] = amd_bytealign (w[49], w[50], offset); w[52] = amd_bytealign (w[48], w[49], offset); w[51] = 
amd_bytealign (w[47], w[48], offset); w[50] = amd_bytealign (w[46], w[47], offset); w[49] = amd_bytealign (w[45], w[46], offset); w[48] = amd_bytealign (w[44], w[45], offset); w[47] = amd_bytealign (w[43], w[44], offset); w[46] = amd_bytealign (w[42], w[43], offset); w[45] = amd_bytealign (w[41], w[42], offset); w[44] = amd_bytealign (w[40], w[41], offset); w[43] = amd_bytealign (w[39], w[40], offset); w[42] = amd_bytealign (w[38], w[39], offset); w[41] = amd_bytealign (w[37], w[38], offset); w[40] = amd_bytealign (w[36], w[37], offset); w[39] = amd_bytealign (w[35], w[36], offset); w[38] = amd_bytealign (w[34], w[35], offset); w[37] = amd_bytealign (w[33], w[34], offset); w[36] = amd_bytealign (w[32], w[33], offset); w[35] = amd_bytealign (w[31], w[32], offset); w[34] = amd_bytealign (w[30], w[31], offset); w[33] = amd_bytealign (w[29], w[30], offset); w[32] = amd_bytealign (w[28], w[29], offset); w[31] = amd_bytealign (w[27], w[28], offset); w[30] = amd_bytealign (w[26], w[27], offset); w[29] = amd_bytealign (w[25], w[26], offset); w[28] = amd_bytealign (w[24], w[25], offset); w[27] = amd_bytealign (w[23], w[24], offset); w[26] = amd_bytealign (w[22], w[23], offset); w[25] = amd_bytealign (w[21], w[22], offset); w[24] = amd_bytealign (w[20], w[21], offset); w[23] = amd_bytealign (w[19], w[20], offset); w[22] = amd_bytealign (w[18], w[19], offset); w[21] = amd_bytealign (w[17], w[18], offset); w[20] = amd_bytealign (w[16], w[17], offset); w[19] = amd_bytealign (w[15], w[16], offset); w[18] = amd_bytealign (w[14], w[15], offset); w[17] = amd_bytealign (w[13], w[14], offset); w[16] = amd_bytealign (w[12], w[13], offset); w[15] = amd_bytealign (w[11], w[12], offset); w[14] = amd_bytealign (w[10], w[11], offset); w[13] = amd_bytealign (w[ 9], w[10], offset); w[12] = amd_bytealign (w[ 8], w[ 9], offset); w[11] = amd_bytealign (w[ 7], w[ 8], offset); w[10] = amd_bytealign (w[ 6], w[ 7], offset); w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); w[ 8] = amd_bytealign (w[ 4], 
w[ 5], offset); w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); w[ 3] = amd_bytealign ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = amd_bytealign (w[58], w[59], offset); w[62] = amd_bytealign (w[57], w[58], offset); w[61] = amd_bytealign (w[56], w[57], offset); w[60] = amd_bytealign (w[55], w[56], offset); w[59] = amd_bytealign (w[54], w[55], offset); w[58] = amd_bytealign (w[53], w[54], offset); w[57] = amd_bytealign (w[52], w[53], offset); w[56] = amd_bytealign (w[51], w[52], offset); w[55] = amd_bytealign (w[50], w[51], offset); w[54] = amd_bytealign (w[49], w[50], offset); w[53] = amd_bytealign (w[48], w[49], offset); w[52] = amd_bytealign (w[47], w[48], offset); w[51] = amd_bytealign (w[46], w[47], offset); w[50] = amd_bytealign (w[45], w[46], offset); w[49] = amd_bytealign (w[44], w[45], offset); w[48] = amd_bytealign (w[43], w[44], offset); w[47] = amd_bytealign (w[42], w[43], offset); w[46] = amd_bytealign (w[41], w[42], offset); w[45] = amd_bytealign (w[40], w[41], offset); w[44] = amd_bytealign (w[39], w[40], offset); w[43] = amd_bytealign (w[38], w[39], offset); w[42] = amd_bytealign (w[37], w[38], offset); w[41] = amd_bytealign (w[36], w[37], offset); w[40] = amd_bytealign (w[35], w[36], offset); w[39] = amd_bytealign (w[34], w[35], offset); w[38] = amd_bytealign (w[33], w[34], offset); w[37] = amd_bytealign (w[32], w[33], offset); w[36] = amd_bytealign (w[31], w[32], offset); w[35] = amd_bytealign (w[30], w[31], offset); w[34] = amd_bytealign (w[29], w[30], offset); w[33] = amd_bytealign (w[28], w[29], offset); w[32] = amd_bytealign (w[27], w[28], offset); w[31] = amd_bytealign (w[26], w[27], offset); w[30] = amd_bytealign (w[25], w[26], offset); w[29] = amd_bytealign (w[24], w[25], offset); w[28] = amd_bytealign (w[23], w[24], offset); w[27] = amd_bytealign (w[22], w[23], offset); w[26] 
= amd_bytealign (w[21], w[22], offset); w[25] = amd_bytealign (w[20], w[21], offset); w[24] = amd_bytealign (w[19], w[20], offset); w[23] = amd_bytealign (w[18], w[19], offset); w[22] = amd_bytealign (w[17], w[18], offset); w[21] = amd_bytealign (w[16], w[17], offset); w[20] = amd_bytealign (w[15], w[16], offset); w[19] = amd_bytealign (w[14], w[15], offset); w[18] = amd_bytealign (w[13], w[14], offset); w[17] = amd_bytealign (w[12], w[13], offset); w[16] = amd_bytealign (w[11], w[12], offset); w[15] = amd_bytealign (w[10], w[11], offset); w[14] = amd_bytealign (w[ 9], w[10], offset); w[13] = amd_bytealign (w[ 8], w[ 9], offset); w[12] = amd_bytealign (w[ 7], w[ 8], offset); w[11] = amd_bytealign (w[ 6], w[ 7], offset); w[10] = amd_bytealign (w[ 5], w[ 6], offset); w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); w[ 4] = amd_bytealign ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = amd_bytealign (w[57], w[58], offset); w[62] = amd_bytealign (w[56], w[57], offset); w[61] = amd_bytealign (w[55], w[56], offset); w[60] = amd_bytealign (w[54], w[55], offset); w[59] = amd_bytealign (w[53], w[54], offset); w[58] = amd_bytealign (w[52], w[53], offset); w[57] = amd_bytealign (w[51], w[52], offset); w[56] = amd_bytealign (w[50], w[51], offset); w[55] = amd_bytealign (w[49], w[50], offset); w[54] = amd_bytealign (w[48], w[49], offset); w[53] = amd_bytealign (w[47], w[48], offset); w[52] = amd_bytealign (w[46], w[47], offset); w[51] = amd_bytealign (w[45], w[46], offset); w[50] = amd_bytealign (w[44], w[45], offset); w[49] = amd_bytealign (w[43], w[44], offset); w[48] = amd_bytealign (w[42], w[43], offset); w[47] = amd_bytealign (w[41], w[42], offset); w[46] = amd_bytealign (w[40], w[41], offset); w[45] = amd_bytealign (w[39], w[40], offset); w[44] = 
amd_bytealign (w[38], w[39], offset); w[43] = amd_bytealign (w[37], w[38], offset); w[42] = amd_bytealign (w[36], w[37], offset); w[41] = amd_bytealign (w[35], w[36], offset); w[40] = amd_bytealign (w[34], w[35], offset); w[39] = amd_bytealign (w[33], w[34], offset); w[38] = amd_bytealign (w[32], w[33], offset); w[37] = amd_bytealign (w[31], w[32], offset); w[36] = amd_bytealign (w[30], w[31], offset); w[35] = amd_bytealign (w[29], w[30], offset); w[34] = amd_bytealign (w[28], w[29], offset); w[33] = amd_bytealign (w[27], w[28], offset); w[32] = amd_bytealign (w[26], w[27], offset); w[31] = amd_bytealign (w[25], w[26], offset); w[30] = amd_bytealign (w[24], w[25], offset); w[29] = amd_bytealign (w[23], w[24], offset); w[28] = amd_bytealign (w[22], w[23], offset); w[27] = amd_bytealign (w[21], w[22], offset); w[26] = amd_bytealign (w[20], w[21], offset); w[25] = amd_bytealign (w[19], w[20], offset); w[24] = amd_bytealign (w[18], w[19], offset); w[23] = amd_bytealign (w[17], w[18], offset); w[22] = amd_bytealign (w[16], w[17], offset); w[21] = amd_bytealign (w[15], w[16], offset); w[20] = amd_bytealign (w[14], w[15], offset); w[19] = amd_bytealign (w[13], w[14], offset); w[18] = amd_bytealign (w[12], w[13], offset); w[17] = amd_bytealign (w[11], w[12], offset); w[16] = amd_bytealign (w[10], w[11], offset); w[15] = amd_bytealign (w[ 9], w[10], offset); w[14] = amd_bytealign (w[ 8], w[ 9], offset); w[13] = amd_bytealign (w[ 7], w[ 8], offset); w[12] = amd_bytealign (w[ 6], w[ 7], offset); w[11] = amd_bytealign (w[ 5], w[ 6], offset); w[10] = amd_bytealign (w[ 4], w[ 5], offset); w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); w[ 8] = amd_bytealign (w[ 2], w[ 3], offset); w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); w[ 5] = amd_bytealign ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = amd_bytealign (w[56], w[57], offset); w[62] = amd_bytealign (w[55], w[56], offset); w[61] = 
amd_bytealign (w[54], w[55], offset); w[60] = amd_bytealign (w[53], w[54], offset); w[59] = amd_bytealign (w[52], w[53], offset); w[58] = amd_bytealign (w[51], w[52], offset); w[57] = amd_bytealign (w[50], w[51], offset); w[56] = amd_bytealign (w[49], w[50], offset); w[55] = amd_bytealign (w[48], w[49], offset); w[54] = amd_bytealign (w[47], w[48], offset); w[53] = amd_bytealign (w[46], w[47], offset); w[52] = amd_bytealign (w[45], w[46], offset); w[51] = amd_bytealign (w[44], w[45], offset); w[50] = amd_bytealign (w[43], w[44], offset); w[49] = amd_bytealign (w[42], w[43], offset); w[48] = amd_bytealign (w[41], w[42], offset); w[47] = amd_bytealign (w[40], w[41], offset); w[46] = amd_bytealign (w[39], w[40], offset); w[45] = amd_bytealign (w[38], w[39], offset); w[44] = amd_bytealign (w[37], w[38], offset); w[43] = amd_bytealign (w[36], w[37], offset); w[42] = amd_bytealign (w[35], w[36], offset); w[41] = amd_bytealign (w[34], w[35], offset); w[40] = amd_bytealign (w[33], w[34], offset); w[39] = amd_bytealign (w[32], w[33], offset); w[38] = amd_bytealign (w[31], w[32], offset); w[37] = amd_bytealign (w[30], w[31], offset); w[36] = amd_bytealign (w[29], w[30], offset); w[35] = amd_bytealign (w[28], w[29], offset); w[34] = amd_bytealign (w[27], w[28], offset); w[33] = amd_bytealign (w[26], w[27], offset); w[32] = amd_bytealign (w[25], w[26], offset); w[31] = amd_bytealign (w[24], w[25], offset); w[30] = amd_bytealign (w[23], w[24], offset); w[29] = amd_bytealign (w[22], w[23], offset); w[28] = amd_bytealign (w[21], w[22], offset); w[27] = amd_bytealign (w[20], w[21], offset); w[26] = amd_bytealign (w[19], w[20], offset); w[25] = amd_bytealign (w[18], w[19], offset); w[24] = amd_bytealign (w[17], w[18], offset); w[23] = amd_bytealign (w[16], w[17], offset); w[22] = amd_bytealign (w[15], w[16], offset); w[21] = amd_bytealign (w[14], w[15], offset); w[20] = amd_bytealign (w[13], w[14], offset); w[19] = amd_bytealign (w[12], w[13], offset); w[18] = amd_bytealign (w[11], 
w[12], offset); w[17] = amd_bytealign (w[10], w[11], offset); w[16] = amd_bytealign (w[ 9], w[10], offset); w[15] = amd_bytealign (w[ 8], w[ 9], offset); w[14] = amd_bytealign (w[ 7], w[ 8], offset); w[13] = amd_bytealign (w[ 6], w[ 7], offset); w[12] = amd_bytealign (w[ 5], w[ 6], offset); w[11] = amd_bytealign (w[ 4], w[ 5], offset); w[10] = amd_bytealign (w[ 3], w[ 4], offset); w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); w[ 6] = amd_bytealign ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = amd_bytealign (w[55], w[56], offset); w[62] = amd_bytealign (w[54], w[55], offset); w[61] = amd_bytealign (w[53], w[54], offset); w[60] = amd_bytealign (w[52], w[53], offset); w[59] = amd_bytealign (w[51], w[52], offset); w[58] = amd_bytealign (w[50], w[51], offset); w[57] = amd_bytealign (w[49], w[50], offset); w[56] = amd_bytealign (w[48], w[49], offset); w[55] = amd_bytealign (w[47], w[48], offset); w[54] = amd_bytealign (w[46], w[47], offset); w[53] = amd_bytealign (w[45], w[46], offset); w[52] = amd_bytealign (w[44], w[45], offset); w[51] = amd_bytealign (w[43], w[44], offset); w[50] = amd_bytealign (w[42], w[43], offset); w[49] = amd_bytealign (w[41], w[42], offset); w[48] = amd_bytealign (w[40], w[41], offset); w[47] = amd_bytealign (w[39], w[40], offset); w[46] = amd_bytealign (w[38], w[39], offset); w[45] = amd_bytealign (w[37], w[38], offset); w[44] = amd_bytealign (w[36], w[37], offset); w[43] = amd_bytealign (w[35], w[36], offset); w[42] = amd_bytealign (w[34], w[35], offset); w[41] = amd_bytealign (w[33], w[34], offset); w[40] = amd_bytealign (w[32], w[33], offset); w[39] = amd_bytealign (w[31], w[32], offset); w[38] = amd_bytealign (w[30], w[31], offset); w[37] = amd_bytealign (w[29], w[30], offset); w[36] = amd_bytealign (w[28], w[29], offset); w[35] = amd_bytealign (w[27], w[28], offset); w[34] = 
amd_bytealign (w[26], w[27], offset); w[33] = amd_bytealign (w[25], w[26], offset); w[32] = amd_bytealign (w[24], w[25], offset); w[31] = amd_bytealign (w[23], w[24], offset); w[30] = amd_bytealign (w[22], w[23], offset); w[29] = amd_bytealign (w[21], w[22], offset); w[28] = amd_bytealign (w[20], w[21], offset); w[27] = amd_bytealign (w[19], w[20], offset); w[26] = amd_bytealign (w[18], w[19], offset); w[25] = amd_bytealign (w[17], w[18], offset); w[24] = amd_bytealign (w[16], w[17], offset); w[23] = amd_bytealign (w[15], w[16], offset); w[22] = amd_bytealign (w[14], w[15], offset); w[21] = amd_bytealign (w[13], w[14], offset); w[20] = amd_bytealign (w[12], w[13], offset); w[19] = amd_bytealign (w[11], w[12], offset); w[18] = amd_bytealign (w[10], w[11], offset); w[17] = amd_bytealign (w[ 9], w[10], offset); w[16] = amd_bytealign (w[ 8], w[ 9], offset); w[15] = amd_bytealign (w[ 7], w[ 8], offset); w[14] = amd_bytealign (w[ 6], w[ 7], offset); w[13] = amd_bytealign (w[ 5], w[ 6], offset); w[12] = amd_bytealign (w[ 4], w[ 5], offset); w[11] = amd_bytealign (w[ 3], w[ 4], offset); w[10] = amd_bytealign (w[ 2], w[ 3], offset); w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); w[ 7] = amd_bytealign ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = amd_bytealign (w[54], w[55], offset); w[62] = amd_bytealign (w[53], w[54], offset); w[61] = amd_bytealign (w[52], w[53], offset); w[60] = amd_bytealign (w[51], w[52], offset); w[59] = amd_bytealign (w[50], w[51], offset); w[58] = amd_bytealign (w[49], w[50], offset); w[57] = amd_bytealign (w[48], w[49], offset); w[56] = amd_bytealign (w[47], w[48], offset); w[55] = amd_bytealign (w[46], w[47], offset); w[54] = amd_bytealign (w[45], w[46], offset); w[53] = amd_bytealign (w[44], w[45], offset); w[52] = amd_bytealign (w[43], w[44], offset); w[51] = amd_bytealign (w[42], w[43], offset); w[50] = amd_bytealign (w[41], 
w[42], offset); w[49] = amd_bytealign (w[40], w[41], offset); w[48] = amd_bytealign (w[39], w[40], offset); w[47] = amd_bytealign (w[38], w[39], offset); w[46] = amd_bytealign (w[37], w[38], offset); w[45] = amd_bytealign (w[36], w[37], offset); w[44] = amd_bytealign (w[35], w[36], offset); w[43] = amd_bytealign (w[34], w[35], offset); w[42] = amd_bytealign (w[33], w[34], offset); w[41] = amd_bytealign (w[32], w[33], offset); w[40] = amd_bytealign (w[31], w[32], offset); w[39] = amd_bytealign (w[30], w[31], offset); w[38] = amd_bytealign (w[29], w[30], offset); w[37] = amd_bytealign (w[28], w[29], offset); w[36] = amd_bytealign (w[27], w[28], offset); w[35] = amd_bytealign (w[26], w[27], offset); w[34] = amd_bytealign (w[25], w[26], offset); w[33] = amd_bytealign (w[24], w[25], offset); w[32] = amd_bytealign (w[23], w[24], offset); w[31] = amd_bytealign (w[22], w[23], offset); w[30] = amd_bytealign (w[21], w[22], offset); w[29] = amd_bytealign (w[20], w[21], offset); w[28] = amd_bytealign (w[19], w[20], offset); w[27] = amd_bytealign (w[18], w[19], offset); w[26] = amd_bytealign (w[17], w[18], offset); w[25] = amd_bytealign (w[16], w[17], offset); w[24] = amd_bytealign (w[15], w[16], offset); w[23] = amd_bytealign (w[14], w[15], offset); w[22] = amd_bytealign (w[13], w[14], offset); w[21] = amd_bytealign (w[12], w[13], offset); w[20] = amd_bytealign (w[11], w[12], offset); w[19] = amd_bytealign (w[10], w[11], offset); w[18] = amd_bytealign (w[ 9], w[10], offset); w[17] = amd_bytealign (w[ 8], w[ 9], offset); w[16] = amd_bytealign (w[ 7], w[ 8], offset); w[15] = amd_bytealign (w[ 6], w[ 7], offset); w[14] = amd_bytealign (w[ 5], w[ 6], offset); w[13] = amd_bytealign (w[ 4], w[ 5], offset); w[12] = amd_bytealign (w[ 3], w[ 4], offset); w[11] = amd_bytealign (w[ 2], w[ 3], offset); w[10] = amd_bytealign (w[ 1], w[ 2], offset); w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); w[ 8] = amd_bytealign ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; 
w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = amd_bytealign (w[53], w[54], offset); w[62] = amd_bytealign (w[52], w[53], offset); w[61] = amd_bytealign (w[51], w[52], offset); w[60] = amd_bytealign (w[50], w[51], offset); w[59] = amd_bytealign (w[49], w[50], offset); w[58] = amd_bytealign (w[48], w[49], offset); w[57] = amd_bytealign (w[47], w[48], offset); w[56] = amd_bytealign (w[46], w[47], offset); w[55] = amd_bytealign (w[45], w[46], offset); w[54] = amd_bytealign (w[44], w[45], offset); w[53] = amd_bytealign (w[43], w[44], offset); w[52] = amd_bytealign (w[42], w[43], offset); w[51] = amd_bytealign (w[41], w[42], offset); w[50] = amd_bytealign (w[40], w[41], offset); w[49] = amd_bytealign (w[39], w[40], offset); w[48] = amd_bytealign (w[38], w[39], offset); w[47] = amd_bytealign (w[37], w[38], offset); w[46] = amd_bytealign (w[36], w[37], offset); w[45] = amd_bytealign (w[35], w[36], offset); w[44] = amd_bytealign (w[34], w[35], offset); w[43] = amd_bytealign (w[33], w[34], offset); w[42] = amd_bytealign (w[32], w[33], offset); w[41] = amd_bytealign (w[31], w[32], offset); w[40] = amd_bytealign (w[30], w[31], offset); w[39] = amd_bytealign (w[29], w[30], offset); w[38] = amd_bytealign (w[28], w[29], offset); w[37] = amd_bytealign (w[27], w[28], offset); w[36] = amd_bytealign (w[26], w[27], offset); w[35] = amd_bytealign (w[25], w[26], offset); w[34] = amd_bytealign (w[24], w[25], offset); w[33] = amd_bytealign (w[23], w[24], offset); w[32] = amd_bytealign (w[22], w[23], offset); w[31] = amd_bytealign (w[21], w[22], offset); w[30] = amd_bytealign (w[20], w[21], offset); w[29] = amd_bytealign (w[19], w[20], offset); w[28] = amd_bytealign (w[18], w[19], offset); w[27] = amd_bytealign (w[17], w[18], offset); w[26] = amd_bytealign (w[16], w[17], offset); w[25] = amd_bytealign (w[15], w[16], offset); w[24] = amd_bytealign (w[14], w[15], offset); w[23] = amd_bytealign (w[13], w[14], offset); w[22] = amd_bytealign (w[12], w[13], offset); w[21] = 
amd_bytealign (w[11], w[12], offset); w[20] = amd_bytealign (w[10], w[11], offset); w[19] = amd_bytealign (w[ 9], w[10], offset); w[18] = amd_bytealign (w[ 8], w[ 9], offset); w[17] = amd_bytealign (w[ 7], w[ 8], offset); w[16] = amd_bytealign (w[ 6], w[ 7], offset); w[15] = amd_bytealign (w[ 5], w[ 6], offset); w[14] = amd_bytealign (w[ 4], w[ 5], offset); w[13] = amd_bytealign (w[ 3], w[ 4], offset); w[12] = amd_bytealign (w[ 2], w[ 3], offset); w[11] = amd_bytealign (w[ 1], w[ 2], offset); w[10] = amd_bytealign (w[ 0], w[ 1], offset); w[ 9] = amd_bytealign ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = amd_bytealign (w[52], w[53], offset); w[62] = amd_bytealign (w[51], w[52], offset); w[61] = amd_bytealign (w[50], w[51], offset); w[60] = amd_bytealign (w[49], w[50], offset); w[59] = amd_bytealign (w[48], w[49], offset); w[58] = amd_bytealign (w[47], w[48], offset); w[57] = amd_bytealign (w[46], w[47], offset); w[56] = amd_bytealign (w[45], w[46], offset); w[55] = amd_bytealign (w[44], w[45], offset); w[54] = amd_bytealign (w[43], w[44], offset); w[53] = amd_bytealign (w[42], w[43], offset); w[52] = amd_bytealign (w[41], w[42], offset); w[51] = amd_bytealign (w[40], w[41], offset); w[50] = amd_bytealign (w[39], w[40], offset); w[49] = amd_bytealign (w[38], w[39], offset); w[48] = amd_bytealign (w[37], w[38], offset); w[47] = amd_bytealign (w[36], w[37], offset); w[46] = amd_bytealign (w[35], w[36], offset); w[45] = amd_bytealign (w[34], w[35], offset); w[44] = amd_bytealign (w[33], w[34], offset); w[43] = amd_bytealign (w[32], w[33], offset); w[42] = amd_bytealign (w[31], w[32], offset); w[41] = amd_bytealign (w[30], w[31], offset); w[40] = amd_bytealign (w[29], w[30], offset); w[39] = amd_bytealign (w[28], w[29], offset); w[38] = amd_bytealign (w[27], w[28], offset); w[37] = amd_bytealign (w[26], w[27], offset); w[36] = amd_bytealign (w[25], w[26], offset); w[35] = 
amd_bytealign (w[24], w[25], offset); w[34] = amd_bytealign (w[23], w[24], offset); w[33] = amd_bytealign (w[22], w[23], offset); w[32] = amd_bytealign (w[21], w[22], offset); w[31] = amd_bytealign (w[20], w[21], offset); w[30] = amd_bytealign (w[19], w[20], offset); w[29] = amd_bytealign (w[18], w[19], offset); w[28] = amd_bytealign (w[17], w[18], offset); w[27] = amd_bytealign (w[16], w[17], offset); w[26] = amd_bytealign (w[15], w[16], offset); w[25] = amd_bytealign (w[14], w[15], offset); w[24] = amd_bytealign (w[13], w[14], offset); w[23] = amd_bytealign (w[12], w[13], offset); w[22] = amd_bytealign (w[11], w[12], offset); w[21] = amd_bytealign (w[10], w[11], offset); w[20] = amd_bytealign (w[ 9], w[10], offset); w[19] = amd_bytealign (w[ 8], w[ 9], offset); w[18] = amd_bytealign (w[ 7], w[ 8], offset); w[17] = amd_bytealign (w[ 6], w[ 7], offset); w[16] = amd_bytealign (w[ 5], w[ 6], offset); w[15] = amd_bytealign (w[ 4], w[ 5], offset); w[14] = amd_bytealign (w[ 3], w[ 4], offset); w[13] = amd_bytealign (w[ 2], w[ 3], offset); w[12] = amd_bytealign (w[ 1], w[ 2], offset); w[11] = amd_bytealign (w[ 0], w[ 1], offset); w[10] = amd_bytealign ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = amd_bytealign (w[51], w[52], offset); w[62] = amd_bytealign (w[50], w[51], offset); w[61] = amd_bytealign (w[49], w[50], offset); w[60] = amd_bytealign (w[48], w[49], offset); w[59] = amd_bytealign (w[47], w[48], offset); w[58] = amd_bytealign (w[46], w[47], offset); w[57] = amd_bytealign (w[45], w[46], offset); w[56] = amd_bytealign (w[44], w[45], offset); w[55] = amd_bytealign (w[43], w[44], offset); w[54] = amd_bytealign (w[42], w[43], offset); w[53] = amd_bytealign (w[41], w[42], offset); w[52] = amd_bytealign (w[40], w[41], offset); w[51] = amd_bytealign (w[39], w[40], offset); w[50] = amd_bytealign (w[38], w[39], offset); w[49] = amd_bytealign (w[37], w[38], 
offset); w[48] = amd_bytealign (w[36], w[37], offset); w[47] = amd_bytealign (w[35], w[36], offset); w[46] = amd_bytealign (w[34], w[35], offset); w[45] = amd_bytealign (w[33], w[34], offset); w[44] = amd_bytealign (w[32], w[33], offset); w[43] = amd_bytealign (w[31], w[32], offset); w[42] = amd_bytealign (w[30], w[31], offset); w[41] = amd_bytealign (w[29], w[30], offset); w[40] = amd_bytealign (w[28], w[29], offset); w[39] = amd_bytealign (w[27], w[28], offset); w[38] = amd_bytealign (w[26], w[27], offset); w[37] = amd_bytealign (w[25], w[26], offset); w[36] = amd_bytealign (w[24], w[25], offset); w[35] = amd_bytealign (w[23], w[24], offset); w[34] = amd_bytealign (w[22], w[23], offset); w[33] = amd_bytealign (w[21], w[22], offset); w[32] = amd_bytealign (w[20], w[21], offset); w[31] = amd_bytealign (w[19], w[20], offset); w[30] = amd_bytealign (w[18], w[19], offset); w[29] = amd_bytealign (w[17], w[18], offset); w[28] = amd_bytealign (w[16], w[17], offset); w[27] = amd_bytealign (w[15], w[16], offset); w[26] = amd_bytealign (w[14], w[15], offset); w[25] = amd_bytealign (w[13], w[14], offset); w[24] = amd_bytealign (w[12], w[13], offset); w[23] = amd_bytealign (w[11], w[12], offset); w[22] = amd_bytealign (w[10], w[11], offset); w[21] = amd_bytealign (w[ 9], w[10], offset); w[20] = amd_bytealign (w[ 8], w[ 9], offset); w[19] = amd_bytealign (w[ 7], w[ 8], offset); w[18] = amd_bytealign (w[ 6], w[ 7], offset); w[17] = amd_bytealign (w[ 5], w[ 6], offset); w[16] = amd_bytealign (w[ 4], w[ 5], offset); w[15] = amd_bytealign (w[ 3], w[ 4], offset); w[14] = amd_bytealign (w[ 2], w[ 3], offset); w[13] = amd_bytealign (w[ 1], w[ 2], offset); w[12] = amd_bytealign (w[ 0], w[ 1], offset); w[11] = amd_bytealign ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = amd_bytealign (w[50], w[51], offset); w[62] = amd_bytealign (w[49], w[50], offset); w[61] = 
amd_bytealign (w[48], w[49], offset); w[60] = amd_bytealign (w[47], w[48], offset); w[59] = amd_bytealign (w[46], w[47], offset); w[58] = amd_bytealign (w[45], w[46], offset); w[57] = amd_bytealign (w[44], w[45], offset); w[56] = amd_bytealign (w[43], w[44], offset); w[55] = amd_bytealign (w[42], w[43], offset); w[54] = amd_bytealign (w[41], w[42], offset); w[53] = amd_bytealign (w[40], w[41], offset); w[52] = amd_bytealign (w[39], w[40], offset); w[51] = amd_bytealign (w[38], w[39], offset); w[50] = amd_bytealign (w[37], w[38], offset); w[49] = amd_bytealign (w[36], w[37], offset); w[48] = amd_bytealign (w[35], w[36], offset); w[47] = amd_bytealign (w[34], w[35], offset); w[46] = amd_bytealign (w[33], w[34], offset); w[45] = amd_bytealign (w[32], w[33], offset); w[44] = amd_bytealign (w[31], w[32], offset); w[43] = amd_bytealign (w[30], w[31], offset); w[42] = amd_bytealign (w[29], w[30], offset); w[41] = amd_bytealign (w[28], w[29], offset); w[40] = amd_bytealign (w[27], w[28], offset); w[39] = amd_bytealign (w[26], w[27], offset); w[38] = amd_bytealign (w[25], w[26], offset); w[37] = amd_bytealign (w[24], w[25], offset); w[36] = amd_bytealign (w[23], w[24], offset); w[35] = amd_bytealign (w[22], w[23], offset); w[34] = amd_bytealign (w[21], w[22], offset); w[33] = amd_bytealign (w[20], w[21], offset); w[32] = amd_bytealign (w[19], w[20], offset); w[31] = amd_bytealign (w[18], w[19], offset); w[30] = amd_bytealign (w[17], w[18], offset); w[29] = amd_bytealign (w[16], w[17], offset); w[28] = amd_bytealign (w[15], w[16], offset); w[27] = amd_bytealign (w[14], w[15], offset); w[26] = amd_bytealign (w[13], w[14], offset); w[25] = amd_bytealign (w[12], w[13], offset); w[24] = amd_bytealign (w[11], w[12], offset); w[23] = amd_bytealign (w[10], w[11], offset); w[22] = amd_bytealign (w[ 9], w[10], offset); w[21] = amd_bytealign (w[ 8], w[ 9], offset); w[20] = amd_bytealign (w[ 7], w[ 8], offset); w[19] = amd_bytealign (w[ 6], w[ 7], offset); w[18] = amd_bytealign (w[ 5], 
w[ 6], offset); w[17] = amd_bytealign (w[ 4], w[ 5], offset); w[16] = amd_bytealign (w[ 3], w[ 4], offset); w[15] = amd_bytealign (w[ 2], w[ 3], offset); w[14] = amd_bytealign (w[ 1], w[ 2], offset); w[13] = amd_bytealign (w[ 0], w[ 1], offset); w[12] = amd_bytealign ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = amd_bytealign (w[49], w[50], offset); w[62] = amd_bytealign (w[48], w[49], offset); w[61] = amd_bytealign (w[47], w[48], offset); w[60] = amd_bytealign (w[46], w[47], offset); w[59] = amd_bytealign (w[45], w[46], offset); w[58] = amd_bytealign (w[44], w[45], offset); w[57] = amd_bytealign (w[43], w[44], offset); w[56] = amd_bytealign (w[42], w[43], offset); w[55] = amd_bytealign (w[41], w[42], offset); w[54] = amd_bytealign (w[40], w[41], offset); w[53] = amd_bytealign (w[39], w[40], offset); w[52] = amd_bytealign (w[38], w[39], offset); w[51] = amd_bytealign (w[37], w[38], offset); w[50] = amd_bytealign (w[36], w[37], offset); w[49] = amd_bytealign (w[35], w[36], offset); w[48] = amd_bytealign (w[34], w[35], offset); w[47] = amd_bytealign (w[33], w[34], offset); w[46] = amd_bytealign (w[32], w[33], offset); w[45] = amd_bytealign (w[31], w[32], offset); w[44] = amd_bytealign (w[30], w[31], offset); w[43] = amd_bytealign (w[29], w[30], offset); w[42] = amd_bytealign (w[28], w[29], offset); w[41] = amd_bytealign (w[27], w[28], offset); w[40] = amd_bytealign (w[26], w[27], offset); w[39] = amd_bytealign (w[25], w[26], offset); w[38] = amd_bytealign (w[24], w[25], offset); w[37] = amd_bytealign (w[23], w[24], offset); w[36] = amd_bytealign (w[22], w[23], offset); w[35] = amd_bytealign (w[21], w[22], offset); w[34] = amd_bytealign (w[20], w[21], offset); w[33] = amd_bytealign (w[19], w[20], offset); w[32] = amd_bytealign (w[18], w[19], offset); w[31] = amd_bytealign (w[17], w[18], offset); w[30] = amd_bytealign (w[16], w[17], 
offset); w[29] = amd_bytealign (w[15], w[16], offset); w[28] = amd_bytealign (w[14], w[15], offset); w[27] = amd_bytealign (w[13], w[14], offset); w[26] = amd_bytealign (w[12], w[13], offset); w[25] = amd_bytealign (w[11], w[12], offset); w[24] = amd_bytealign (w[10], w[11], offset); w[23] = amd_bytealign (w[ 9], w[10], offset); w[22] = amd_bytealign (w[ 8], w[ 9], offset); w[21] = amd_bytealign (w[ 7], w[ 8], offset); w[20] = amd_bytealign (w[ 6], w[ 7], offset); w[19] = amd_bytealign (w[ 5], w[ 6], offset); w[18] = amd_bytealign (w[ 4], w[ 5], offset); w[17] = amd_bytealign (w[ 3], w[ 4], offset); w[16] = amd_bytealign (w[ 2], w[ 3], offset); w[15] = amd_bytealign (w[ 1], w[ 2], offset); w[14] = amd_bytealign (w[ 0], w[ 1], offset); w[13] = amd_bytealign ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 14: w[63] = amd_bytealign (w[48], w[49], offset); w[62] = amd_bytealign (w[47], w[48], offset); w[61] = amd_bytealign (w[46], w[47], offset); w[60] = amd_bytealign (w[45], w[46], offset); w[59] = amd_bytealign (w[44], w[45], offset); w[58] = amd_bytealign (w[43], w[44], offset); w[57] = amd_bytealign (w[42], w[43], offset); w[56] = amd_bytealign (w[41], w[42], offset); w[55] = amd_bytealign (w[40], w[41], offset); w[54] = amd_bytealign (w[39], w[40], offset); w[53] = amd_bytealign (w[38], w[39], offset); w[52] = amd_bytealign (w[37], w[38], offset); w[51] = amd_bytealign (w[36], w[37], offset); w[50] = amd_bytealign (w[35], w[36], offset); w[49] = amd_bytealign (w[34], w[35], offset); w[48] = amd_bytealign (w[33], w[34], offset); w[47] = amd_bytealign (w[32], w[33], offset); w[46] = amd_bytealign (w[31], w[32], offset); w[45] = amd_bytealign (w[30], w[31], offset); w[44] = amd_bytealign (w[29], w[30], offset); w[43] = amd_bytealign (w[28], w[29], offset); w[42] = amd_bytealign (w[27], w[28], offset); w[41] = amd_bytealign (w[26], w[27], 
offset); w[40] = amd_bytealign (w[25], w[26], offset); w[39] = amd_bytealign (w[24], w[25], offset); w[38] = amd_bytealign (w[23], w[24], offset); w[37] = amd_bytealign (w[22], w[23], offset); w[36] = amd_bytealign (w[21], w[22], offset); w[35] = amd_bytealign (w[20], w[21], offset); w[34] = amd_bytealign (w[19], w[20], offset); w[33] = amd_bytealign (w[18], w[19], offset); w[32] = amd_bytealign (w[17], w[18], offset); w[31] = amd_bytealign (w[16], w[17], offset); w[30] = amd_bytealign (w[15], w[16], offset); w[29] = amd_bytealign (w[14], w[15], offset); w[28] = amd_bytealign (w[13], w[14], offset); w[27] = amd_bytealign (w[12], w[13], offset); w[26] = amd_bytealign (w[11], w[12], offset); w[25] = amd_bytealign (w[10], w[11], offset); w[24] = amd_bytealign (w[ 9], w[10], offset); w[23] = amd_bytealign (w[ 8], w[ 9], offset); w[22] = amd_bytealign (w[ 7], w[ 8], offset); w[21] = amd_bytealign (w[ 6], w[ 7], offset); w[20] = amd_bytealign (w[ 5], w[ 6], offset); w[19] = amd_bytealign (w[ 4], w[ 5], offset); w[18] = amd_bytealign (w[ 3], w[ 4], offset); w[17] = amd_bytealign (w[ 2], w[ 3], offset); w[16] = amd_bytealign (w[ 1], w[ 2], offset); w[15] = amd_bytealign (w[ 0], w[ 1], offset); w[14] = amd_bytealign ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = amd_bytealign (w[47], w[48], offset); w[62] = amd_bytealign (w[46], w[47], offset); w[61] = amd_bytealign (w[45], w[46], offset); w[60] = amd_bytealign (w[44], w[45], offset); w[59] = amd_bytealign (w[43], w[44], offset); w[58] = amd_bytealign (w[42], w[43], offset); w[57] = amd_bytealign (w[41], w[42], offset); w[56] = amd_bytealign (w[40], w[41], offset); w[55] = amd_bytealign (w[39], w[40], offset); w[54] = amd_bytealign (w[38], w[39], offset); w[53] = amd_bytealign (w[37], w[38], offset); w[52] = amd_bytealign (w[36], w[37], offset); w[51] = amd_bytealign (w[35], 
w[36], offset); w[50] = amd_bytealign (w[34], w[35], offset); w[49] = amd_bytealign (w[33], w[34], offset); w[48] = amd_bytealign (w[32], w[33], offset); w[47] = amd_bytealign (w[31], w[32], offset); w[46] = amd_bytealign (w[30], w[31], offset); w[45] = amd_bytealign (w[29], w[30], offset); w[44] = amd_bytealign (w[28], w[29], offset); w[43] = amd_bytealign (w[27], w[28], offset); w[42] = amd_bytealign (w[26], w[27], offset); w[41] = amd_bytealign (w[25], w[26], offset); w[40] = amd_bytealign (w[24], w[25], offset); w[39] = amd_bytealign (w[23], w[24], offset); w[38] = amd_bytealign (w[22], w[23], offset); w[37] = amd_bytealign (w[21], w[22], offset); w[36] = amd_bytealign (w[20], w[21], offset); w[35] = amd_bytealign (w[19], w[20], offset); w[34] = amd_bytealign (w[18], w[19], offset); w[33] = amd_bytealign (w[17], w[18], offset); w[32] = amd_bytealign (w[16], w[17], offset); w[31] = amd_bytealign (w[15], w[16], offset); w[30] = amd_bytealign (w[14], w[15], offset); w[29] = amd_bytealign (w[13], w[14], offset); w[28] = amd_bytealign (w[12], w[13], offset); w[27] = amd_bytealign (w[11], w[12], offset); w[26] = amd_bytealign (w[10], w[11], offset); w[25] = amd_bytealign (w[ 9], w[10], offset); w[24] = amd_bytealign (w[ 8], w[ 9], offset); w[23] = amd_bytealign (w[ 7], w[ 8], offset); w[22] = amd_bytealign (w[ 6], w[ 7], offset); w[21] = amd_bytealign (w[ 5], w[ 6], offset); w[20] = amd_bytealign (w[ 4], w[ 5], offset); w[19] = amd_bytealign (w[ 3], w[ 4], offset); w[18] = amd_bytealign (w[ 2], w[ 3], offset); w[17] = amd_bytealign (w[ 1], w[ 2], offset); w[16] = amd_bytealign (w[ 0], w[ 1], offset); w[15] = amd_bytealign ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = amd_bytealign (w[46], w[47], offset); w[62] = amd_bytealign (w[45], w[46], offset); w[61] = amd_bytealign (w[44], w[45], offset); w[60] = 
amd_bytealign (w[43], w[44], offset); w[59] = amd_bytealign (w[42], w[43], offset); w[58] = amd_bytealign (w[41], w[42], offset); w[57] = amd_bytealign (w[40], w[41], offset); w[56] = amd_bytealign (w[39], w[40], offset); w[55] = amd_bytealign (w[38], w[39], offset); w[54] = amd_bytealign (w[37], w[38], offset); w[53] = amd_bytealign (w[36], w[37], offset); w[52] = amd_bytealign (w[35], w[36], offset); w[51] = amd_bytealign (w[34], w[35], offset); w[50] = amd_bytealign (w[33], w[34], offset); w[49] = amd_bytealign (w[32], w[33], offset); w[48] = amd_bytealign (w[31], w[32], offset); w[47] = amd_bytealign (w[30], w[31], offset); w[46] = amd_bytealign (w[29], w[30], offset); w[45] = amd_bytealign (w[28], w[29], offset); w[44] = amd_bytealign (w[27], w[28], offset); w[43] = amd_bytealign (w[26], w[27], offset); w[42] = amd_bytealign (w[25], w[26], offset); w[41] = amd_bytealign (w[24], w[25], offset); w[40] = amd_bytealign (w[23], w[24], offset); w[39] = amd_bytealign (w[22], w[23], offset); w[38] = amd_bytealign (w[21], w[22], offset); w[37] = amd_bytealign (w[20], w[21], offset); w[36] = amd_bytealign (w[19], w[20], offset); w[35] = amd_bytealign (w[18], w[19], offset); w[34] = amd_bytealign (w[17], w[18], offset); w[33] = amd_bytealign (w[16], w[17], offset); w[32] = amd_bytealign (w[15], w[16], offset); w[31] = amd_bytealign (w[14], w[15], offset); w[30] = amd_bytealign (w[13], w[14], offset); w[29] = amd_bytealign (w[12], w[13], offset); w[28] = amd_bytealign (w[11], w[12], offset); w[27] = amd_bytealign (w[10], w[11], offset); w[26] = amd_bytealign (w[ 9], w[10], offset); w[25] = amd_bytealign (w[ 8], w[ 9], offset); w[24] = amd_bytealign (w[ 7], w[ 8], offset); w[23] = amd_bytealign (w[ 6], w[ 7], offset); w[22] = amd_bytealign (w[ 5], w[ 6], offset); w[21] = amd_bytealign (w[ 4], w[ 5], offset); w[20] = amd_bytealign (w[ 3], w[ 4], offset); w[19] = amd_bytealign (w[ 2], w[ 3], offset); w[18] = amd_bytealign (w[ 1], w[ 2], offset); w[17] = amd_bytealign (w[ 0], 
w[ 1], offset); w[16] = amd_bytealign ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = amd_bytealign (w[45], w[46], offset); w[62] = amd_bytealign (w[44], w[45], offset); w[61] = amd_bytealign (w[43], w[44], offset); w[60] = amd_bytealign (w[42], w[43], offset); w[59] = amd_bytealign (w[41], w[42], offset); w[58] = amd_bytealign (w[40], w[41], offset); w[57] = amd_bytealign (w[39], w[40], offset); w[56] = amd_bytealign (w[38], w[39], offset); w[55] = amd_bytealign (w[37], w[38], offset); w[54] = amd_bytealign (w[36], w[37], offset); w[53] = amd_bytealign (w[35], w[36], offset); w[52] = amd_bytealign (w[34], w[35], offset); w[51] = amd_bytealign (w[33], w[34], offset); w[50] = amd_bytealign (w[32], w[33], offset); w[49] = amd_bytealign (w[31], w[32], offset); w[48] = amd_bytealign (w[30], w[31], offset); w[47] = amd_bytealign (w[29], w[30], offset); w[46] = amd_bytealign (w[28], w[29], offset); w[45] = amd_bytealign (w[27], w[28], offset); w[44] = amd_bytealign (w[26], w[27], offset); w[43] = amd_bytealign (w[25], w[26], offset); w[42] = amd_bytealign (w[24], w[25], offset); w[41] = amd_bytealign (w[23], w[24], offset); w[40] = amd_bytealign (w[22], w[23], offset); w[39] = amd_bytealign (w[21], w[22], offset); w[38] = amd_bytealign (w[20], w[21], offset); w[37] = amd_bytealign (w[19], w[20], offset); w[36] = amd_bytealign (w[18], w[19], offset); w[35] = amd_bytealign (w[17], w[18], offset); w[34] = amd_bytealign (w[16], w[17], offset); w[33] = amd_bytealign (w[15], w[16], offset); w[32] = amd_bytealign (w[14], w[15], offset); w[31] = amd_bytealign (w[13], w[14], offset); w[30] = amd_bytealign (w[12], w[13], offset); w[29] = amd_bytealign (w[11], w[12], offset); w[28] = amd_bytealign (w[10], w[11], offset); w[27] = amd_bytealign (w[ 9], w[10], offset); w[26] = amd_bytealign (w[ 8], w[ 9], offset); 
w[25] = amd_bytealign (w[ 7], w[ 8], offset); w[24] = amd_bytealign (w[ 6], w[ 7], offset); w[23] = amd_bytealign (w[ 5], w[ 6], offset); w[22] = amd_bytealign (w[ 4], w[ 5], offset); w[21] = amd_bytealign (w[ 3], w[ 4], offset); w[20] = amd_bytealign (w[ 2], w[ 3], offset); w[19] = amd_bytealign (w[ 1], w[ 2], offset); w[18] = amd_bytealign (w[ 0], w[ 1], offset); w[17] = amd_bytealign ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = amd_bytealign (w[44], w[45], offset); w[62] = amd_bytealign (w[43], w[44], offset); w[61] = amd_bytealign (w[42], w[43], offset); w[60] = amd_bytealign (w[41], w[42], offset); w[59] = amd_bytealign (w[40], w[41], offset); w[58] = amd_bytealign (w[39], w[40], offset); w[57] = amd_bytealign (w[38], w[39], offset); w[56] = amd_bytealign (w[37], w[38], offset); w[55] = amd_bytealign (w[36], w[37], offset); w[54] = amd_bytealign (w[35], w[36], offset); w[53] = amd_bytealign (w[34], w[35], offset); w[52] = amd_bytealign (w[33], w[34], offset); w[51] = amd_bytealign (w[32], w[33], offset); w[50] = amd_bytealign (w[31], w[32], offset); w[49] = amd_bytealign (w[30], w[31], offset); w[48] = amd_bytealign (w[29], w[30], offset); w[47] = amd_bytealign (w[28], w[29], offset); w[46] = amd_bytealign (w[27], w[28], offset); w[45] = amd_bytealign (w[26], w[27], offset); w[44] = amd_bytealign (w[25], w[26], offset); w[43] = amd_bytealign (w[24], w[25], offset); w[42] = amd_bytealign (w[23], w[24], offset); w[41] = amd_bytealign (w[22], w[23], offset); w[40] = amd_bytealign (w[21], w[22], offset); w[39] = amd_bytealign (w[20], w[21], offset); w[38] = amd_bytealign (w[19], w[20], offset); w[37] = amd_bytealign (w[18], w[19], offset); w[36] = amd_bytealign (w[17], w[18], offset); w[35] = amd_bytealign (w[16], w[17], offset); w[34] = amd_bytealign (w[15], w[16], offset); w[33] 
= amd_bytealign (w[14], w[15], offset); w[32] = amd_bytealign (w[13], w[14], offset); w[31] = amd_bytealign (w[12], w[13], offset); w[30] = amd_bytealign (w[11], w[12], offset); w[29] = amd_bytealign (w[10], w[11], offset); w[28] = amd_bytealign (w[ 9], w[10], offset); w[27] = amd_bytealign (w[ 8], w[ 9], offset); w[26] = amd_bytealign (w[ 7], w[ 8], offset); w[25] = amd_bytealign (w[ 6], w[ 7], offset); w[24] = amd_bytealign (w[ 5], w[ 6], offset); w[23] = amd_bytealign (w[ 4], w[ 5], offset); w[22] = amd_bytealign (w[ 3], w[ 4], offset); w[21] = amd_bytealign (w[ 2], w[ 3], offset); w[20] = amd_bytealign (w[ 1], w[ 2], offset); w[19] = amd_bytealign (w[ 0], w[ 1], offset); w[18] = amd_bytealign ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = amd_bytealign (w[43], w[44], offset); w[62] = amd_bytealign (w[42], w[43], offset); w[61] = amd_bytealign (w[41], w[42], offset); w[60] = amd_bytealign (w[40], w[41], offset); w[59] = amd_bytealign (w[39], w[40], offset); w[58] = amd_bytealign (w[38], w[39], offset); w[57] = amd_bytealign (w[37], w[38], offset); w[56] = amd_bytealign (w[36], w[37], offset); w[55] = amd_bytealign (w[35], w[36], offset); w[54] = amd_bytealign (w[34], w[35], offset); w[53] = amd_bytealign (w[33], w[34], offset); w[52] = amd_bytealign (w[32], w[33], offset); w[51] = amd_bytealign (w[31], w[32], offset); w[50] = amd_bytealign (w[30], w[31], offset); w[49] = amd_bytealign (w[29], w[30], offset); w[48] = amd_bytealign (w[28], w[29], offset); w[47] = amd_bytealign (w[27], w[28], offset); w[46] = amd_bytealign (w[26], w[27], offset); w[45] = amd_bytealign (w[25], w[26], offset); w[44] = amd_bytealign (w[24], w[25], offset); w[43] = amd_bytealign (w[23], w[24], offset); w[42] = amd_bytealign (w[22], w[23], offset); w[41] = amd_bytealign (w[21], w[22], offset); 
w[40] = amd_bytealign (w[20], w[21], offset); w[39] = amd_bytealign (w[19], w[20], offset); w[38] = amd_bytealign (w[18], w[19], offset); w[37] = amd_bytealign (w[17], w[18], offset); w[36] = amd_bytealign (w[16], w[17], offset); w[35] = amd_bytealign (w[15], w[16], offset); w[34] = amd_bytealign (w[14], w[15], offset); w[33] = amd_bytealign (w[13], w[14], offset); w[32] = amd_bytealign (w[12], w[13], offset); w[31] = amd_bytealign (w[11], w[12], offset); w[30] = amd_bytealign (w[10], w[11], offset); w[29] = amd_bytealign (w[ 9], w[10], offset); w[28] = amd_bytealign (w[ 8], w[ 9], offset); w[27] = amd_bytealign (w[ 7], w[ 8], offset); w[26] = amd_bytealign (w[ 6], w[ 7], offset); w[25] = amd_bytealign (w[ 5], w[ 6], offset); w[24] = amd_bytealign (w[ 4], w[ 5], offset); w[23] = amd_bytealign (w[ 3], w[ 4], offset); w[22] = amd_bytealign (w[ 2], w[ 3], offset); w[21] = amd_bytealign (w[ 1], w[ 2], offset); w[20] = amd_bytealign (w[ 0], w[ 1], offset); w[19] = amd_bytealign ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = amd_bytealign (w[42], w[43], offset); w[62] = amd_bytealign (w[41], w[42], offset); w[61] = amd_bytealign (w[40], w[41], offset); w[60] = amd_bytealign (w[39], w[40], offset); w[59] = amd_bytealign (w[38], w[39], offset); w[58] = amd_bytealign (w[37], w[38], offset); w[57] = amd_bytealign (w[36], w[37], offset); w[56] = amd_bytealign (w[35], w[36], offset); w[55] = amd_bytealign (w[34], w[35], offset); w[54] = amd_bytealign (w[33], w[34], offset); w[53] = amd_bytealign (w[32], w[33], offset); w[52] = amd_bytealign (w[31], w[32], offset); w[51] = amd_bytealign (w[30], w[31], offset); w[50] = amd_bytealign (w[29], w[30], offset); w[49] = amd_bytealign (w[28], w[29], offset); w[48] = amd_bytealign (w[27], w[28], offset); w[47] = amd_bytealign (w[26], 
w[27], offset); w[46] = amd_bytealign (w[25], w[26], offset); w[45] = amd_bytealign (w[24], w[25], offset); w[44] = amd_bytealign (w[23], w[24], offset); w[43] = amd_bytealign (w[22], w[23], offset); w[42] = amd_bytealign (w[21], w[22], offset); w[41] = amd_bytealign (w[20], w[21], offset); w[40] = amd_bytealign (w[19], w[20], offset); w[39] = amd_bytealign (w[18], w[19], offset); w[38] = amd_bytealign (w[17], w[18], offset); w[37] = amd_bytealign (w[16], w[17], offset); w[36] = amd_bytealign (w[15], w[16], offset); w[35] = amd_bytealign (w[14], w[15], offset); w[34] = amd_bytealign (w[13], w[14], offset); w[33] = amd_bytealign (w[12], w[13], offset); w[32] = amd_bytealign (w[11], w[12], offset); w[31] = amd_bytealign (w[10], w[11], offset); w[30] = amd_bytealign (w[ 9], w[10], offset); w[29] = amd_bytealign (w[ 8], w[ 9], offset); w[28] = amd_bytealign (w[ 7], w[ 8], offset); w[27] = amd_bytealign (w[ 6], w[ 7], offset); w[26] = amd_bytealign (w[ 5], w[ 6], offset); w[25] = amd_bytealign (w[ 4], w[ 5], offset); w[24] = amd_bytealign (w[ 3], w[ 4], offset); w[23] = amd_bytealign (w[ 2], w[ 3], offset); w[22] = amd_bytealign (w[ 1], w[ 2], offset); w[21] = amd_bytealign (w[ 0], w[ 1], offset); w[20] = amd_bytealign ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = amd_bytealign (w[41], w[42], offset); w[62] = amd_bytealign (w[40], w[41], offset); w[61] = amd_bytealign (w[39], w[40], offset); w[60] = amd_bytealign (w[38], w[39], offset); w[59] = amd_bytealign (w[37], w[38], offset); w[58] = amd_bytealign (w[36], w[37], offset); w[57] = amd_bytealign (w[35], w[36], offset); w[56] = amd_bytealign (w[34], w[35], offset); w[55] = amd_bytealign (w[33], w[34], offset); w[54] = amd_bytealign (w[32], w[33], offset); w[53] = amd_bytealign (w[31], w[32], offset); 
w[52] = amd_bytealign (w[30], w[31], offset); w[51] = amd_bytealign (w[29], w[30], offset); w[50] = amd_bytealign (w[28], w[29], offset); w[49] = amd_bytealign (w[27], w[28], offset); w[48] = amd_bytealign (w[26], w[27], offset); w[47] = amd_bytealign (w[25], w[26], offset); w[46] = amd_bytealign (w[24], w[25], offset); w[45] = amd_bytealign (w[23], w[24], offset); w[44] = amd_bytealign (w[22], w[23], offset); w[43] = amd_bytealign (w[21], w[22], offset); w[42] = amd_bytealign (w[20], w[21], offset); w[41] = amd_bytealign (w[19], w[20], offset); w[40] = amd_bytealign (w[18], w[19], offset); w[39] = amd_bytealign (w[17], w[18], offset); w[38] = amd_bytealign (w[16], w[17], offset); w[37] = amd_bytealign (w[15], w[16], offset); w[36] = amd_bytealign (w[14], w[15], offset); w[35] = amd_bytealign (w[13], w[14], offset); w[34] = amd_bytealign (w[12], w[13], offset); w[33] = amd_bytealign (w[11], w[12], offset); w[32] = amd_bytealign (w[10], w[11], offset); w[31] = amd_bytealign (w[ 9], w[10], offset); w[30] = amd_bytealign (w[ 8], w[ 9], offset); w[29] = amd_bytealign (w[ 7], w[ 8], offset); w[28] = amd_bytealign (w[ 6], w[ 7], offset); w[27] = amd_bytealign (w[ 5], w[ 6], offset); w[26] = amd_bytealign (w[ 4], w[ 5], offset); w[25] = amd_bytealign (w[ 3], w[ 4], offset); w[24] = amd_bytealign (w[ 2], w[ 3], offset); w[23] = amd_bytealign (w[ 1], w[ 2], offset); w[22] = amd_bytealign (w[ 0], w[ 1], offset); w[21] = amd_bytealign ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = amd_bytealign (w[40], w[41], offset); w[62] = amd_bytealign (w[39], w[40], offset); w[61] = amd_bytealign (w[38], w[39], offset); w[60] = amd_bytealign (w[37], w[38], offset); w[59] = amd_bytealign (w[36], w[37], offset); w[58] = amd_bytealign (w[35], w[36], offset); w[57] = 
amd_bytealign (w[34], w[35], offset); w[56] = amd_bytealign (w[33], w[34], offset); w[55] = amd_bytealign (w[32], w[33], offset); w[54] = amd_bytealign (w[31], w[32], offset); w[53] = amd_bytealign (w[30], w[31], offset); w[52] = amd_bytealign (w[29], w[30], offset); w[51] = amd_bytealign (w[28], w[29], offset); w[50] = amd_bytealign (w[27], w[28], offset); w[49] = amd_bytealign (w[26], w[27], offset); w[48] = amd_bytealign (w[25], w[26], offset); w[47] = amd_bytealign (w[24], w[25], offset); w[46] = amd_bytealign (w[23], w[24], offset); w[45] = amd_bytealign (w[22], w[23], offset); w[44] = amd_bytealign (w[21], w[22], offset); w[43] = amd_bytealign (w[20], w[21], offset); w[42] = amd_bytealign (w[19], w[20], offset); w[41] = amd_bytealign (w[18], w[19], offset); w[40] = amd_bytealign (w[17], w[18], offset); w[39] = amd_bytealign (w[16], w[17], offset); w[38] = amd_bytealign (w[15], w[16], offset); w[37] = amd_bytealign (w[14], w[15], offset); w[36] = amd_bytealign (w[13], w[14], offset); w[35] = amd_bytealign (w[12], w[13], offset); w[34] = amd_bytealign (w[11], w[12], offset); w[33] = amd_bytealign (w[10], w[11], offset); w[32] = amd_bytealign (w[ 9], w[10], offset); w[31] = amd_bytealign (w[ 8], w[ 9], offset); w[30] = amd_bytealign (w[ 7], w[ 8], offset); w[29] = amd_bytealign (w[ 6], w[ 7], offset); w[28] = amd_bytealign (w[ 5], w[ 6], offset); w[27] = amd_bytealign (w[ 4], w[ 5], offset); w[26] = amd_bytealign (w[ 3], w[ 4], offset); w[25] = amd_bytealign (w[ 2], w[ 3], offset); w[24] = amd_bytealign (w[ 1], w[ 2], offset); w[23] = amd_bytealign (w[ 0], w[ 1], offset); w[22] = amd_bytealign ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = amd_bytealign (w[39], w[40], offset); w[62] = amd_bytealign (w[38], w[39], offset); 
w[61] = amd_bytealign (w[37], w[38], offset); w[60] = amd_bytealign (w[36], w[37], offset); w[59] = amd_bytealign (w[35], w[36], offset); w[58] = amd_bytealign (w[34], w[35], offset); w[57] = amd_bytealign (w[33], w[34], offset); w[56] = amd_bytealign (w[32], w[33], offset); w[55] = amd_bytealign (w[31], w[32], offset); w[54] = amd_bytealign (w[30], w[31], offset); w[53] = amd_bytealign (w[29], w[30], offset); w[52] = amd_bytealign (w[28], w[29], offset); w[51] = amd_bytealign (w[27], w[28], offset); w[50] = amd_bytealign (w[26], w[27], offset); w[49] = amd_bytealign (w[25], w[26], offset); w[48] = amd_bytealign (w[24], w[25], offset); w[47] = amd_bytealign (w[23], w[24], offset); w[46] = amd_bytealign (w[22], w[23], offset); w[45] = amd_bytealign (w[21], w[22], offset); w[44] = amd_bytealign (w[20], w[21], offset); w[43] = amd_bytealign (w[19], w[20], offset); w[42] = amd_bytealign (w[18], w[19], offset); w[41] = amd_bytealign (w[17], w[18], offset); w[40] = amd_bytealign (w[16], w[17], offset); w[39] = amd_bytealign (w[15], w[16], offset); w[38] = amd_bytealign (w[14], w[15], offset); w[37] = amd_bytealign (w[13], w[14], offset); w[36] = amd_bytealign (w[12], w[13], offset); w[35] = amd_bytealign (w[11], w[12], offset); w[34] = amd_bytealign (w[10], w[11], offset); w[33] = amd_bytealign (w[ 9], w[10], offset); w[32] = amd_bytealign (w[ 8], w[ 9], offset); w[31] = amd_bytealign (w[ 7], w[ 8], offset); w[30] = amd_bytealign (w[ 6], w[ 7], offset); w[29] = amd_bytealign (w[ 5], w[ 6], offset); w[28] = amd_bytealign (w[ 4], w[ 5], offset); w[27] = amd_bytealign (w[ 3], w[ 4], offset); w[26] = amd_bytealign (w[ 2], w[ 3], offset); w[25] = amd_bytealign (w[ 1], w[ 2], offset); w[24] = amd_bytealign (w[ 0], w[ 1], offset); w[23] = amd_bytealign ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; 
w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = amd_bytealign (w[38], w[39], offset); w[62] = amd_bytealign (w[37], w[38], offset); w[61] = amd_bytealign (w[36], w[37], offset); w[60] = amd_bytealign (w[35], w[36], offset); w[59] = amd_bytealign (w[34], w[35], offset); w[58] = amd_bytealign (w[33], w[34], offset); w[57] = amd_bytealign (w[32], w[33], offset); w[56] = amd_bytealign (w[31], w[32], offset); w[55] = amd_bytealign (w[30], w[31], offset); w[54] = amd_bytealign (w[29], w[30], offset); w[53] = amd_bytealign (w[28], w[29], offset); w[52] = amd_bytealign (w[27], w[28], offset); w[51] = amd_bytealign (w[26], w[27], offset); w[50] = amd_bytealign (w[25], w[26], offset); w[49] = amd_bytealign (w[24], w[25], offset); w[48] = amd_bytealign (w[23], w[24], offset); w[47] = amd_bytealign (w[22], w[23], offset); w[46] = amd_bytealign (w[21], w[22], offset); w[45] = amd_bytealign (w[20], w[21], offset); w[44] = amd_bytealign (w[19], w[20], offset); w[43] = amd_bytealign (w[18], w[19], offset); w[42] = amd_bytealign (w[17], w[18], offset); w[41] = amd_bytealign (w[16], w[17], offset); w[40] = amd_bytealign (w[15], w[16], offset); w[39] = amd_bytealign (w[14], w[15], offset); w[38] = amd_bytealign (w[13], w[14], offset); w[37] = amd_bytealign (w[12], w[13], offset); w[36] = amd_bytealign (w[11], w[12], offset); w[35] = amd_bytealign (w[10], w[11], offset); w[34] = amd_bytealign (w[ 9], w[10], offset); w[33] = amd_bytealign (w[ 8], w[ 9], offset); w[32] = amd_bytealign (w[ 7], w[ 8], offset); w[31] = amd_bytealign (w[ 6], w[ 7], offset); w[30] = amd_bytealign (w[ 5], w[ 6], offset); w[29] = amd_bytealign (w[ 4], w[ 5], offset); w[28] = amd_bytealign (w[ 3], w[ 4], offset); w[27] = amd_bytealign (w[ 2], w[ 3], offset); w[26] = amd_bytealign (w[ 1], w[ 2], offset); w[25] = amd_bytealign (w[ 0], w[ 1], offset); w[24] = amd_bytealign ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; 
w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = amd_bytealign (w[37], w[38], offset); w[62] = amd_bytealign (w[36], w[37], offset); w[61] = amd_bytealign (w[35], w[36], offset); w[60] = amd_bytealign (w[34], w[35], offset); w[59] = amd_bytealign (w[33], w[34], offset); w[58] = amd_bytealign (w[32], w[33], offset); w[57] = amd_bytealign (w[31], w[32], offset); w[56] = amd_bytealign (w[30], w[31], offset); w[55] = amd_bytealign (w[29], w[30], offset); w[54] = amd_bytealign (w[28], w[29], offset); w[53] = amd_bytealign (w[27], w[28], offset); w[52] = amd_bytealign (w[26], w[27], offset); w[51] = amd_bytealign (w[25], w[26], offset); w[50] = amd_bytealign (w[24], w[25], offset); w[49] = amd_bytealign (w[23], w[24], offset); w[48] = amd_bytealign (w[22], w[23], offset); w[47] = amd_bytealign (w[21], w[22], offset); w[46] = amd_bytealign (w[20], w[21], offset); w[45] = amd_bytealign (w[19], w[20], offset); w[44] = amd_bytealign (w[18], w[19], offset); w[43] = amd_bytealign (w[17], w[18], offset); w[42] = amd_bytealign (w[16], w[17], offset); w[41] = amd_bytealign (w[15], w[16], offset); w[40] = amd_bytealign (w[14], w[15], offset); w[39] = amd_bytealign (w[13], w[14], offset); w[38] = amd_bytealign (w[12], w[13], offset); w[37] = amd_bytealign (w[11], w[12], offset); w[36] = amd_bytealign (w[10], w[11], offset); w[35] = amd_bytealign (w[ 9], w[10], offset); w[34] = amd_bytealign (w[ 8], w[ 9], offset); w[33] = amd_bytealign (w[ 7], w[ 8], offset); w[32] = amd_bytealign (w[ 6], w[ 7], offset); w[31] = amd_bytealign (w[ 5], w[ 6], offset); w[30] = amd_bytealign (w[ 4], w[ 5], offset); w[29] = amd_bytealign (w[ 3], w[ 4], offset); w[28] = amd_bytealign (w[ 2], w[ 3], offset); w[27] = amd_bytealign (w[ 1], w[ 2], offset); w[26] = amd_bytealign (w[ 0], w[ 1], offset); w[25] = amd_bytealign ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] 
= 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = amd_bytealign (w[36], w[37], offset); w[62] = amd_bytealign (w[35], w[36], offset); w[61] = amd_bytealign (w[34], w[35], offset); w[60] = amd_bytealign (w[33], w[34], offset); w[59] = amd_bytealign (w[32], w[33], offset); w[58] = amd_bytealign (w[31], w[32], offset); w[57] = amd_bytealign (w[30], w[31], offset); w[56] = amd_bytealign (w[29], w[30], offset); w[55] = amd_bytealign (w[28], w[29], offset); w[54] = amd_bytealign (w[27], w[28], offset); w[53] = amd_bytealign (w[26], w[27], offset); w[52] = amd_bytealign (w[25], w[26], offset); w[51] = amd_bytealign (w[24], w[25], offset); w[50] = amd_bytealign (w[23], w[24], offset); w[49] = amd_bytealign (w[22], w[23], offset); w[48] = amd_bytealign (w[21], w[22], offset); w[47] = amd_bytealign (w[20], w[21], offset); w[46] = amd_bytealign (w[19], w[20], offset); w[45] = amd_bytealign (w[18], w[19], offset); w[44] = amd_bytealign (w[17], w[18], offset); w[43] = amd_bytealign (w[16], w[17], offset); w[42] = amd_bytealign (w[15], w[16], offset); w[41] = amd_bytealign (w[14], w[15], offset); w[40] = amd_bytealign (w[13], w[14], offset); w[39] = amd_bytealign (w[12], w[13], offset); w[38] = amd_bytealign (w[11], w[12], offset); w[37] = amd_bytealign (w[10], w[11], offset); w[36] = amd_bytealign (w[ 9], w[10], offset); w[35] = amd_bytealign (w[ 8], w[ 9], offset); w[34] = amd_bytealign (w[ 7], w[ 8], offset); w[33] = amd_bytealign (w[ 6], w[ 7], offset); w[32] = amd_bytealign (w[ 5], w[ 6], offset); w[31] = amd_bytealign (w[ 4], w[ 5], offset); w[30] = amd_bytealign (w[ 3], w[ 4], offset); w[29] = amd_bytealign (w[ 2], w[ 3], offset); w[28] = amd_bytealign (w[ 1], w[ 2], offset); w[27] = amd_bytealign (w[ 0], w[ 1], offset); w[26] = amd_bytealign ( 0, w[ 0], 
offset); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = amd_bytealign (w[35], w[36], offset); w[62] = amd_bytealign (w[34], w[35], offset); w[61] = amd_bytealign (w[33], w[34], offset); w[60] = amd_bytealign (w[32], w[33], offset); w[59] = amd_bytealign (w[31], w[32], offset); w[58] = amd_bytealign (w[30], w[31], offset); w[57] = amd_bytealign (w[29], w[30], offset); w[56] = amd_bytealign (w[28], w[29], offset); w[55] = amd_bytealign (w[27], w[28], offset); w[54] = amd_bytealign (w[26], w[27], offset); w[53] = amd_bytealign (w[25], w[26], offset); w[52] = amd_bytealign (w[24], w[25], offset); w[51] = amd_bytealign (w[23], w[24], offset); w[50] = amd_bytealign (w[22], w[23], offset); w[49] = amd_bytealign (w[21], w[22], offset); w[48] = amd_bytealign (w[20], w[21], offset); w[47] = amd_bytealign (w[19], w[20], offset); w[46] = amd_bytealign (w[18], w[19], offset); w[45] = amd_bytealign (w[17], w[18], offset); w[44] = amd_bytealign (w[16], w[17], offset); w[43] = amd_bytealign (w[15], w[16], offset); w[42] = amd_bytealign (w[14], w[15], offset); w[41] = amd_bytealign (w[13], w[14], offset); w[40] = amd_bytealign (w[12], w[13], offset); w[39] = amd_bytealign (w[11], w[12], offset); w[38] = amd_bytealign (w[10], w[11], offset); w[37] = amd_bytealign (w[ 9], w[10], offset); w[36] = amd_bytealign (w[ 8], w[ 9], offset); w[35] = amd_bytealign (w[ 7], w[ 8], offset); w[34] = amd_bytealign (w[ 6], w[ 7], offset); w[33] = amd_bytealign (w[ 5], w[ 6], offset); w[32] = amd_bytealign (w[ 4], w[ 5], offset); w[31] = amd_bytealign (w[ 3], w[ 4], offset); w[30] = amd_bytealign (w[ 2], w[ 3], offset); w[29] = amd_bytealign (w[ 1], w[ 2], offset); w[28] = amd_bytealign (w[ 0], w[ 1], offset); w[27] = amd_bytealign ( 0, w[ 
0], offset); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = amd_bytealign (w[34], w[35], offset); w[62] = amd_bytealign (w[33], w[34], offset); w[61] = amd_bytealign (w[32], w[33], offset); w[60] = amd_bytealign (w[31], w[32], offset); w[59] = amd_bytealign (w[30], w[31], offset); w[58] = amd_bytealign (w[29], w[30], offset); w[57] = amd_bytealign (w[28], w[29], offset); w[56] = amd_bytealign (w[27], w[28], offset); w[55] = amd_bytealign (w[26], w[27], offset); w[54] = amd_bytealign (w[25], w[26], offset); w[53] = amd_bytealign (w[24], w[25], offset); w[52] = amd_bytealign (w[23], w[24], offset); w[51] = amd_bytealign (w[22], w[23], offset); w[50] = amd_bytealign (w[21], w[22], offset); w[49] = amd_bytealign (w[20], w[21], offset); w[48] = amd_bytealign (w[19], w[20], offset); w[47] = amd_bytealign (w[18], w[19], offset); w[46] = amd_bytealign (w[17], w[18], offset); w[45] = amd_bytealign (w[16], w[17], offset); w[44] = amd_bytealign (w[15], w[16], offset); w[43] = amd_bytealign (w[14], w[15], offset); w[42] = amd_bytealign (w[13], w[14], offset); w[41] = amd_bytealign (w[12], w[13], offset); w[40] = amd_bytealign (w[11], w[12], offset); w[39] = amd_bytealign (w[10], w[11], offset); w[38] = amd_bytealign (w[ 9], w[10], offset); w[37] = amd_bytealign (w[ 8], w[ 9], offset); w[36] = amd_bytealign (w[ 7], w[ 8], offset); w[35] = amd_bytealign (w[ 6], w[ 7], offset); w[34] = amd_bytealign (w[ 5], w[ 6], offset); w[33] = amd_bytealign (w[ 4], w[ 5], offset); w[32] = amd_bytealign (w[ 3], w[ 4], offset); w[31] = amd_bytealign (w[ 2], w[ 3], offset); w[30] = amd_bytealign (w[ 1], w[ 2], offset); w[29] = amd_bytealign (w[ 0], w[ 1], offset); w[28] = amd_bytealign ( 0, w[ 0], offset); w[27] = 0; w[26] = 
0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 29: w[63] = amd_bytealign (w[33], w[34], offset); w[62] = amd_bytealign (w[32], w[33], offset); w[61] = amd_bytealign (w[31], w[32], offset); w[60] = amd_bytealign (w[30], w[31], offset); w[59] = amd_bytealign (w[29], w[30], offset); w[58] = amd_bytealign (w[28], w[29], offset); w[57] = amd_bytealign (w[27], w[28], offset); w[56] = amd_bytealign (w[26], w[27], offset); w[55] = amd_bytealign (w[25], w[26], offset); w[54] = amd_bytealign (w[24], w[25], offset); w[53] = amd_bytealign (w[23], w[24], offset); w[52] = amd_bytealign (w[22], w[23], offset); w[51] = amd_bytealign (w[21], w[22], offset); w[50] = amd_bytealign (w[20], w[21], offset); w[49] = amd_bytealign (w[19], w[20], offset); w[48] = amd_bytealign (w[18], w[19], offset); w[47] = amd_bytealign (w[17], w[18], offset); w[46] = amd_bytealign (w[16], w[17], offset); w[45] = amd_bytealign (w[15], w[16], offset); w[44] = amd_bytealign (w[14], w[15], offset); w[43] = amd_bytealign (w[13], w[14], offset); w[42] = amd_bytealign (w[12], w[13], offset); w[41] = amd_bytealign (w[11], w[12], offset); w[40] = amd_bytealign (w[10], w[11], offset); w[39] = amd_bytealign (w[ 9], w[10], offset); w[38] = amd_bytealign (w[ 8], w[ 9], offset); w[37] = amd_bytealign (w[ 7], w[ 8], offset); w[36] = amd_bytealign (w[ 6], w[ 7], offset); w[35] = amd_bytealign (w[ 5], w[ 6], offset); w[34] = amd_bytealign (w[ 4], w[ 5], offset); w[33] = amd_bytealign (w[ 3], w[ 4], offset); w[32] = amd_bytealign (w[ 2], w[ 3], offset); w[31] = amd_bytealign (w[ 1], w[ 2], offset); w[30] = amd_bytealign (w[ 0], w[ 1], offset); w[29] = amd_bytealign ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; 
w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 30: w[63] = amd_bytealign (w[32], w[33], offset); w[62] = amd_bytealign (w[31], w[32], offset); w[61] = amd_bytealign (w[30], w[31], offset); w[60] = amd_bytealign (w[29], w[30], offset); w[59] = amd_bytealign (w[28], w[29], offset); w[58] = amd_bytealign (w[27], w[28], offset); w[57] = amd_bytealign (w[26], w[27], offset); w[56] = amd_bytealign (w[25], w[26], offset); w[55] = amd_bytealign (w[24], w[25], offset); w[54] = amd_bytealign (w[23], w[24], offset); w[53] = amd_bytealign (w[22], w[23], offset); w[52] = amd_bytealign (w[21], w[22], offset); w[51] = amd_bytealign (w[20], w[21], offset); w[50] = amd_bytealign (w[19], w[20], offset); w[49] = amd_bytealign (w[18], w[19], offset); w[48] = amd_bytealign (w[17], w[18], offset); w[47] = amd_bytealign (w[16], w[17], offset); w[46] = amd_bytealign (w[15], w[16], offset); w[45] = amd_bytealign (w[14], w[15], offset); w[44] = amd_bytealign (w[13], w[14], offset); w[43] = amd_bytealign (w[12], w[13], offset); w[42] = amd_bytealign (w[11], w[12], offset); w[41] = amd_bytealign (w[10], w[11], offset); w[40] = amd_bytealign (w[ 9], w[10], offset); w[39] = amd_bytealign (w[ 8], w[ 9], offset); w[38] = amd_bytealign (w[ 7], w[ 8], offset); w[37] = amd_bytealign (w[ 6], w[ 7], offset); w[36] = amd_bytealign (w[ 5], w[ 6], offset); w[35] = amd_bytealign (w[ 4], w[ 5], offset); w[34] = amd_bytealign (w[ 3], w[ 4], offset); w[33] = amd_bytealign (w[ 2], w[ 3], offset); w[32] = amd_bytealign (w[ 1], w[ 2], offset); w[31] = amd_bytealign (w[ 0], w[ 1], offset); w[30] = amd_bytealign ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; 
w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 31: w[63] = amd_bytealign (w[31], w[32], offset); w[62] = amd_bytealign (w[30], w[31], offset); w[61] = amd_bytealign (w[29], w[30], offset); w[60] = amd_bytealign (w[28], w[29], offset); w[59] = amd_bytealign (w[27], w[28], offset); w[58] = amd_bytealign (w[26], w[27], offset); w[57] = amd_bytealign (w[25], w[26], offset); w[56] = amd_bytealign (w[24], w[25], offset); w[55] = amd_bytealign (w[23], w[24], offset); w[54] = amd_bytealign (w[22], w[23], offset); w[53] = amd_bytealign (w[21], w[22], offset); w[52] = amd_bytealign (w[20], w[21], offset); w[51] = amd_bytealign (w[19], w[20], offset); w[50] = amd_bytealign (w[18], w[19], offset); w[49] = amd_bytealign (w[17], w[18], offset); w[48] = amd_bytealign (w[16], w[17], offset); w[47] = amd_bytealign (w[15], w[16], offset); w[46] = amd_bytealign (w[14], w[15], offset); w[45] = amd_bytealign (w[13], w[14], offset); w[44] = amd_bytealign (w[12], w[13], offset); w[43] = amd_bytealign (w[11], w[12], offset); w[42] = amd_bytealign (w[10], w[11], offset); w[41] = amd_bytealign (w[ 9], w[10], offset); w[40] = amd_bytealign (w[ 8], w[ 9], offset); w[39] = amd_bytealign (w[ 7], w[ 8], offset); w[38] = amd_bytealign (w[ 6], w[ 7], offset); w[37] = amd_bytealign (w[ 5], w[ 6], offset); w[36] = amd_bytealign (w[ 4], w[ 5], offset); w[35] = amd_bytealign (w[ 3], w[ 4], offset); w[34] = amd_bytealign (w[ 2], w[ 3], offset); w[33] = amd_bytealign (w[ 1], w[ 2], offset); w[32] = amd_bytealign (w[ 0], w[ 1], offset); w[31] = amd_bytealign ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] 
= 0; w[ 0] = 0; break; case 32: w[63] = amd_bytealign (w[30], w[31], offset); w[62] = amd_bytealign (w[29], w[30], offset); w[61] = amd_bytealign (w[28], w[29], offset); w[60] = amd_bytealign (w[27], w[28], offset); w[59] = amd_bytealign (w[26], w[27], offset); w[58] = amd_bytealign (w[25], w[26], offset); w[57] = amd_bytealign (w[24], w[25], offset); w[56] = amd_bytealign (w[23], w[24], offset); w[55] = amd_bytealign (w[22], w[23], offset); w[54] = amd_bytealign (w[21], w[22], offset); w[53] = amd_bytealign (w[20], w[21], offset); w[52] = amd_bytealign (w[19], w[20], offset); w[51] = amd_bytealign (w[18], w[19], offset); w[50] = amd_bytealign (w[17], w[18], offset); w[49] = amd_bytealign (w[16], w[17], offset); w[48] = amd_bytealign (w[15], w[16], offset); w[47] = amd_bytealign (w[14], w[15], offset); w[46] = amd_bytealign (w[13], w[14], offset); w[45] = amd_bytealign (w[12], w[13], offset); w[44] = amd_bytealign (w[11], w[12], offset); w[43] = amd_bytealign (w[10], w[11], offset); w[42] = amd_bytealign (w[ 9], w[10], offset); w[41] = amd_bytealign (w[ 8], w[ 9], offset); w[40] = amd_bytealign (w[ 7], w[ 8], offset); w[39] = amd_bytealign (w[ 6], w[ 7], offset); w[38] = amd_bytealign (w[ 5], w[ 6], offset); w[37] = amd_bytealign (w[ 4], w[ 5], offset); w[36] = amd_bytealign (w[ 3], w[ 4], offset); w[35] = amd_bytealign (w[ 2], w[ 3], offset); w[34] = amd_bytealign (w[ 1], w[ 2], offset); w[33] = amd_bytealign (w[ 0], w[ 1], offset); w[32] = amd_bytealign ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = amd_bytealign (w[29], w[30], offset); w[62] = amd_bytealign (w[28], w[29], offset); w[61] = amd_bytealign (w[27], w[28], 
offset); w[60] = amd_bytealign (w[26], w[27], offset); w[59] = amd_bytealign (w[25], w[26], offset); w[58] = amd_bytealign (w[24], w[25], offset); w[57] = amd_bytealign (w[23], w[24], offset); w[56] = amd_bytealign (w[22], w[23], offset); w[55] = amd_bytealign (w[21], w[22], offset); w[54] = amd_bytealign (w[20], w[21], offset); w[53] = amd_bytealign (w[19], w[20], offset); w[52] = amd_bytealign (w[18], w[19], offset); w[51] = amd_bytealign (w[17], w[18], offset); w[50] = amd_bytealign (w[16], w[17], offset); w[49] = amd_bytealign (w[15], w[16], offset); w[48] = amd_bytealign (w[14], w[15], offset); w[47] = amd_bytealign (w[13], w[14], offset); w[46] = amd_bytealign (w[12], w[13], offset); w[45] = amd_bytealign (w[11], w[12], offset); w[44] = amd_bytealign (w[10], w[11], offset); w[43] = amd_bytealign (w[ 9], w[10], offset); w[42] = amd_bytealign (w[ 8], w[ 9], offset); w[41] = amd_bytealign (w[ 7], w[ 8], offset); w[40] = amd_bytealign (w[ 6], w[ 7], offset); w[39] = amd_bytealign (w[ 5], w[ 6], offset); w[38] = amd_bytealign (w[ 4], w[ 5], offset); w[37] = amd_bytealign (w[ 3], w[ 4], offset); w[36] = amd_bytealign (w[ 2], w[ 3], offset); w[35] = amd_bytealign (w[ 1], w[ 2], offset); w[34] = amd_bytealign (w[ 0], w[ 1], offset); w[33] = amd_bytealign ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = amd_bytealign (w[28], w[29], offset); w[62] = amd_bytealign (w[27], w[28], offset); w[61] = amd_bytealign (w[26], w[27], offset); w[60] = amd_bytealign (w[25], w[26], offset); w[59] = amd_bytealign (w[24], w[25], offset); w[58] = amd_bytealign (w[23], w[24], offset); w[57] = amd_bytealign (w[22], w[23], offset); 
w[56] = amd_bytealign (w[21], w[22], offset); w[55] = amd_bytealign (w[20], w[21], offset); w[54] = amd_bytealign (w[19], w[20], offset); w[53] = amd_bytealign (w[18], w[19], offset); w[52] = amd_bytealign (w[17], w[18], offset); w[51] = amd_bytealign (w[16], w[17], offset); w[50] = amd_bytealign (w[15], w[16], offset); w[49] = amd_bytealign (w[14], w[15], offset); w[48] = amd_bytealign (w[13], w[14], offset); w[47] = amd_bytealign (w[12], w[13], offset); w[46] = amd_bytealign (w[11], w[12], offset); w[45] = amd_bytealign (w[10], w[11], offset); w[44] = amd_bytealign (w[ 9], w[10], offset); w[43] = amd_bytealign (w[ 8], w[ 9], offset); w[42] = amd_bytealign (w[ 7], w[ 8], offset); w[41] = amd_bytealign (w[ 6], w[ 7], offset); w[40] = amd_bytealign (w[ 5], w[ 6], offset); w[39] = amd_bytealign (w[ 4], w[ 5], offset); w[38] = amd_bytealign (w[ 3], w[ 4], offset); w[37] = amd_bytealign (w[ 2], w[ 3], offset); w[36] = amd_bytealign (w[ 1], w[ 2], offset); w[35] = amd_bytealign (w[ 0], w[ 1], offset); w[34] = amd_bytealign ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = amd_bytealign (w[27], w[28], offset); w[62] = amd_bytealign (w[26], w[27], offset); w[61] = amd_bytealign (w[25], w[26], offset); w[60] = amd_bytealign (w[24], w[25], offset); w[59] = amd_bytealign (w[23], w[24], offset); w[58] = amd_bytealign (w[22], w[23], offset); w[57] = amd_bytealign (w[21], w[22], offset); w[56] = amd_bytealign (w[20], w[21], offset); w[55] = amd_bytealign (w[19], w[20], offset); w[54] = amd_bytealign (w[18], w[19], offset); w[53] = amd_bytealign (w[17], w[18], offset); w[52] = amd_bytealign (w[16], w[17], offset); 
w[51] = amd_bytealign (w[15], w[16], offset); w[50] = amd_bytealign (w[14], w[15], offset); w[49] = amd_bytealign (w[13], w[14], offset); w[48] = amd_bytealign (w[12], w[13], offset); w[47] = amd_bytealign (w[11], w[12], offset); w[46] = amd_bytealign (w[10], w[11], offset); w[45] = amd_bytealign (w[ 9], w[10], offset); w[44] = amd_bytealign (w[ 8], w[ 9], offset); w[43] = amd_bytealign (w[ 7], w[ 8], offset); w[42] = amd_bytealign (w[ 6], w[ 7], offset); w[41] = amd_bytealign (w[ 5], w[ 6], offset); w[40] = amd_bytealign (w[ 4], w[ 5], offset); w[39] = amd_bytealign (w[ 3], w[ 4], offset); w[38] = amd_bytealign (w[ 2], w[ 3], offset); w[37] = amd_bytealign (w[ 1], w[ 2], offset); w[36] = amd_bytealign (w[ 0], w[ 1], offset); w[35] = amd_bytealign ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = amd_bytealign (w[26], w[27], offset); w[62] = amd_bytealign (w[25], w[26], offset); w[61] = amd_bytealign (w[24], w[25], offset); w[60] = amd_bytealign (w[23], w[24], offset); w[59] = amd_bytealign (w[22], w[23], offset); w[58] = amd_bytealign (w[21], w[22], offset); w[57] = amd_bytealign (w[20], w[21], offset); w[56] = amd_bytealign (w[19], w[20], offset); w[55] = amd_bytealign (w[18], w[19], offset); w[54] = amd_bytealign (w[17], w[18], offset); w[53] = amd_bytealign (w[16], w[17], offset); w[52] = amd_bytealign (w[15], w[16], offset); w[51] = amd_bytealign (w[14], w[15], offset); w[50] = amd_bytealign (w[13], w[14], offset); w[49] = amd_bytealign (w[12], w[13], offset); w[48] = amd_bytealign (w[11], w[12], offset); w[47] = amd_bytealign (w[10], w[11], offset); w[46] = amd_bytealign (w[ 9], w[10], 
offset); w[45] = amd_bytealign (w[ 8], w[ 9], offset); w[44] = amd_bytealign (w[ 7], w[ 8], offset); w[43] = amd_bytealign (w[ 6], w[ 7], offset); w[42] = amd_bytealign (w[ 5], w[ 6], offset); w[41] = amd_bytealign (w[ 4], w[ 5], offset); w[40] = amd_bytealign (w[ 3], w[ 4], offset); w[39] = amd_bytealign (w[ 2], w[ 3], offset); w[38] = amd_bytealign (w[ 1], w[ 2], offset); w[37] = amd_bytealign (w[ 0], w[ 1], offset); w[36] = amd_bytealign ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = amd_bytealign (w[25], w[26], offset); w[62] = amd_bytealign (w[24], w[25], offset); w[61] = amd_bytealign (w[23], w[24], offset); w[60] = amd_bytealign (w[22], w[23], offset); w[59] = amd_bytealign (w[21], w[22], offset); w[58] = amd_bytealign (w[20], w[21], offset); w[57] = amd_bytealign (w[19], w[20], offset); w[56] = amd_bytealign (w[18], w[19], offset); w[55] = amd_bytealign (w[17], w[18], offset); w[54] = amd_bytealign (w[16], w[17], offset); w[53] = amd_bytealign (w[15], w[16], offset); w[52] = amd_bytealign (w[14], w[15], offset); w[51] = amd_bytealign (w[13], w[14], offset); w[50] = amd_bytealign (w[12], w[13], offset); w[49] = amd_bytealign (w[11], w[12], offset); w[48] = amd_bytealign (w[10], w[11], offset); w[47] = amd_bytealign (w[ 9], w[10], offset); w[46] = amd_bytealign (w[ 8], w[ 9], offset); w[45] = amd_bytealign (w[ 7], w[ 8], offset); w[44] = amd_bytealign (w[ 6], w[ 7], offset); w[43] = amd_bytealign (w[ 5], w[ 6], offset); w[42] = amd_bytealign (w[ 4], w[ 5], offset); w[41] = amd_bytealign (w[ 3], w[ 4], offset); w[40] = amd_bytealign (w[ 2], w[ 3], offset); w[39] = 
amd_bytealign (w[ 1], w[ 2], offset); w[38] = amd_bytealign (w[ 0], w[ 1], offset); w[37] = amd_bytealign ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = amd_bytealign (w[24], w[25], offset); w[62] = amd_bytealign (w[23], w[24], offset); w[61] = amd_bytealign (w[22], w[23], offset); w[60] = amd_bytealign (w[21], w[22], offset); w[59] = amd_bytealign (w[20], w[21], offset); w[58] = amd_bytealign (w[19], w[20], offset); w[57] = amd_bytealign (w[18], w[19], offset); w[56] = amd_bytealign (w[17], w[18], offset); w[55] = amd_bytealign (w[16], w[17], offset); w[54] = amd_bytealign (w[15], w[16], offset); w[53] = amd_bytealign (w[14], w[15], offset); w[52] = amd_bytealign (w[13], w[14], offset); w[51] = amd_bytealign (w[12], w[13], offset); w[50] = amd_bytealign (w[11], w[12], offset); w[49] = amd_bytealign (w[10], w[11], offset); w[48] = amd_bytealign (w[ 9], w[10], offset); w[47] = amd_bytealign (w[ 8], w[ 9], offset); w[46] = amd_bytealign (w[ 7], w[ 8], offset); w[45] = amd_bytealign (w[ 6], w[ 7], offset); w[44] = amd_bytealign (w[ 5], w[ 6], offset); w[43] = amd_bytealign (w[ 4], w[ 5], offset); w[42] = amd_bytealign (w[ 3], w[ 4], offset); w[41] = amd_bytealign (w[ 2], w[ 3], offset); w[40] = amd_bytealign (w[ 1], w[ 2], offset); w[39] = amd_bytealign (w[ 0], w[ 1], offset); w[38] = amd_bytealign ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; 
w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = amd_bytealign (w[23], w[24], offset); w[62] = amd_bytealign (w[22], w[23], offset); w[61] = amd_bytealign (w[21], w[22], offset); w[60] = amd_bytealign (w[20], w[21], offset); w[59] = amd_bytealign (w[19], w[20], offset); w[58] = amd_bytealign (w[18], w[19], offset); w[57] = amd_bytealign (w[17], w[18], offset); w[56] = amd_bytealign (w[16], w[17], offset); w[55] = amd_bytealign (w[15], w[16], offset); w[54] = amd_bytealign (w[14], w[15], offset); w[53] = amd_bytealign (w[13], w[14], offset); w[52] = amd_bytealign (w[12], w[13], offset); w[51] = amd_bytealign (w[11], w[12], offset); w[50] = amd_bytealign (w[10], w[11], offset); w[49] = amd_bytealign (w[ 9], w[10], offset); w[48] = amd_bytealign (w[ 8], w[ 9], offset); w[47] = amd_bytealign (w[ 7], w[ 8], offset); w[46] = amd_bytealign (w[ 6], w[ 7], offset); w[45] = amd_bytealign (w[ 5], w[ 6], offset); w[44] = amd_bytealign (w[ 4], w[ 5], offset); w[43] = amd_bytealign (w[ 3], w[ 4], offset); w[42] = amd_bytealign (w[ 2], w[ 3], offset); w[41] = amd_bytealign (w[ 1], w[ 2], offset); w[40] = amd_bytealign (w[ 0], w[ 1], offset); w[39] = amd_bytealign ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = amd_bytealign (w[22], w[23], offset); w[62] = amd_bytealign (w[21], w[22], offset); w[61] = amd_bytealign (w[20], w[21], offset); w[60] = amd_bytealign (w[19], w[20], offset); w[59] = amd_bytealign (w[18], w[19], 
offset); w[58] = amd_bytealign (w[17], w[18], offset); w[57] = amd_bytealign (w[16], w[17], offset); w[56] = amd_bytealign (w[15], w[16], offset); w[55] = amd_bytealign (w[14], w[15], offset); w[54] = amd_bytealign (w[13], w[14], offset); w[53] = amd_bytealign (w[12], w[13], offset); w[52] = amd_bytealign (w[11], w[12], offset); w[51] = amd_bytealign (w[10], w[11], offset); w[50] = amd_bytealign (w[ 9], w[10], offset); w[49] = amd_bytealign (w[ 8], w[ 9], offset); w[48] = amd_bytealign (w[ 7], w[ 8], offset); w[47] = amd_bytealign (w[ 6], w[ 7], offset); w[46] = amd_bytealign (w[ 5], w[ 6], offset); w[45] = amd_bytealign (w[ 4], w[ 5], offset); w[44] = amd_bytealign (w[ 3], w[ 4], offset); w[43] = amd_bytealign (w[ 2], w[ 3], offset); w[42] = amd_bytealign (w[ 1], w[ 2], offset); w[41] = amd_bytealign (w[ 0], w[ 1], offset); w[40] = amd_bytealign ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = amd_bytealign (w[21], w[22], offset); w[62] = amd_bytealign (w[20], w[21], offset); w[61] = amd_bytealign (w[19], w[20], offset); w[60] = amd_bytealign (w[18], w[19], offset); w[59] = amd_bytealign (w[17], w[18], offset); w[58] = amd_bytealign (w[16], w[17], offset); w[57] = amd_bytealign (w[15], w[16], offset); w[56] = amd_bytealign (w[14], w[15], offset); w[55] = amd_bytealign (w[13], w[14], offset); w[54] = amd_bytealign (w[12], w[13], offset); w[53] = amd_bytealign (w[11], w[12], offset); w[52] = amd_bytealign (w[10], w[11], offset); w[51] = amd_bytealign (w[ 9], w[10], offset); w[50] = amd_bytealign (w[ 8], w[ 9], offset); w[49] = 
amd_bytealign (w[ 7], w[ 8], offset); w[48] = amd_bytealign (w[ 6], w[ 7], offset); w[47] = amd_bytealign (w[ 5], w[ 6], offset); w[46] = amd_bytealign (w[ 4], w[ 5], offset); w[45] = amd_bytealign (w[ 3], w[ 4], offset); w[44] = amd_bytealign (w[ 2], w[ 3], offset); w[43] = amd_bytealign (w[ 1], w[ 2], offset); w[42] = amd_bytealign (w[ 0], w[ 1], offset); w[41] = amd_bytealign ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 42: w[63] = amd_bytealign (w[20], w[21], offset); w[62] = amd_bytealign (w[19], w[20], offset); w[61] = amd_bytealign (w[18], w[19], offset); w[60] = amd_bytealign (w[17], w[18], offset); w[59] = amd_bytealign (w[16], w[17], offset); w[58] = amd_bytealign (w[15], w[16], offset); w[57] = amd_bytealign (w[14], w[15], offset); w[56] = amd_bytealign (w[13], w[14], offset); w[55] = amd_bytealign (w[12], w[13], offset); w[54] = amd_bytealign (w[11], w[12], offset); w[53] = amd_bytealign (w[10], w[11], offset); w[52] = amd_bytealign (w[ 9], w[10], offset); w[51] = amd_bytealign (w[ 8], w[ 9], offset); w[50] = amd_bytealign (w[ 7], w[ 8], offset); w[49] = amd_bytealign (w[ 6], w[ 7], offset); w[48] = amd_bytealign (w[ 5], w[ 6], offset); w[47] = amd_bytealign (w[ 4], w[ 5], offset); w[46] = amd_bytealign (w[ 3], w[ 4], offset); w[45] = amd_bytealign (w[ 2], w[ 3], offset); w[44] = amd_bytealign (w[ 1], w[ 2], offset); w[43] = amd_bytealign (w[ 0], w[ 1], offset); w[42] = amd_bytealign ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; 
w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = amd_bytealign (w[19], w[20], offset); w[62] = amd_bytealign (w[18], w[19], offset); w[61] = amd_bytealign (w[17], w[18], offset); w[60] = amd_bytealign (w[16], w[17], offset); w[59] = amd_bytealign (w[15], w[16], offset); w[58] = amd_bytealign (w[14], w[15], offset); w[57] = amd_bytealign (w[13], w[14], offset); w[56] = amd_bytealign (w[12], w[13], offset); w[55] = amd_bytealign (w[11], w[12], offset); w[54] = amd_bytealign (w[10], w[11], offset); w[53] = amd_bytealign (w[ 9], w[10], offset); w[52] = amd_bytealign (w[ 8], w[ 9], offset); w[51] = amd_bytealign (w[ 7], w[ 8], offset); w[50] = amd_bytealign (w[ 6], w[ 7], offset); w[49] = amd_bytealign (w[ 5], w[ 6], offset); w[48] = amd_bytealign (w[ 4], w[ 5], offset); w[47] = amd_bytealign (w[ 3], w[ 4], offset); w[46] = amd_bytealign (w[ 2], w[ 3], offset); w[45] = amd_bytealign (w[ 1], w[ 2], offset); w[44] = amd_bytealign (w[ 0], w[ 1], offset); w[43] = amd_bytealign ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = amd_bytealign (w[18], w[19], offset); w[62] = amd_bytealign (w[17], w[18], offset); w[61] = amd_bytealign (w[16], w[17], offset); w[60] = amd_bytealign (w[15], w[16], offset); w[59] 
= amd_bytealign (w[14], w[15], offset); w[58] = amd_bytealign (w[13], w[14], offset); w[57] = amd_bytealign (w[12], w[13], offset); w[56] = amd_bytealign (w[11], w[12], offset); w[55] = amd_bytealign (w[10], w[11], offset); w[54] = amd_bytealign (w[ 9], w[10], offset); w[53] = amd_bytealign (w[ 8], w[ 9], offset); w[52] = amd_bytealign (w[ 7], w[ 8], offset); w[51] = amd_bytealign (w[ 6], w[ 7], offset); w[50] = amd_bytealign (w[ 5], w[ 6], offset); w[49] = amd_bytealign (w[ 4], w[ 5], offset); w[48] = amd_bytealign (w[ 3], w[ 4], offset); w[47] = amd_bytealign (w[ 2], w[ 3], offset); w[46] = amd_bytealign (w[ 1], w[ 2], offset); w[45] = amd_bytealign (w[ 0], w[ 1], offset); w[44] = amd_bytealign ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = amd_bytealign (w[17], w[18], offset); w[62] = amd_bytealign (w[16], w[17], offset); w[61] = amd_bytealign (w[15], w[16], offset); w[60] = amd_bytealign (w[14], w[15], offset); w[59] = amd_bytealign (w[13], w[14], offset); w[58] = amd_bytealign (w[12], w[13], offset); w[57] = amd_bytealign (w[11], w[12], offset); w[56] = amd_bytealign (w[10], w[11], offset); w[55] = amd_bytealign (w[ 9], w[10], offset); w[54] = amd_bytealign (w[ 8], w[ 9], offset); w[53] = amd_bytealign (w[ 7], w[ 8], offset); w[52] = amd_bytealign (w[ 6], w[ 7], offset); w[51] = amd_bytealign (w[ 5], w[ 6], offset); w[50] = amd_bytealign (w[ 4], w[ 5], offset); w[49] = amd_bytealign (w[ 3], w[ 4], offset); w[48] = amd_bytealign (w[ 2], w[ 3], offset); w[47] = amd_bytealign (w[ 1], w[ 2], 
offset); w[46] = amd_bytealign (w[ 0], w[ 1], offset); w[45] = amd_bytealign ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = amd_bytealign (w[16], w[17], offset); w[62] = amd_bytealign (w[15], w[16], offset); w[61] = amd_bytealign (w[14], w[15], offset); w[60] = amd_bytealign (w[13], w[14], offset); w[59] = amd_bytealign (w[12], w[13], offset); w[58] = amd_bytealign (w[11], w[12], offset); w[57] = amd_bytealign (w[10], w[11], offset); w[56] = amd_bytealign (w[ 9], w[10], offset); w[55] = amd_bytealign (w[ 8], w[ 9], offset); w[54] = amd_bytealign (w[ 7], w[ 8], offset); w[53] = amd_bytealign (w[ 6], w[ 7], offset); w[52] = amd_bytealign (w[ 5], w[ 6], offset); w[51] = amd_bytealign (w[ 4], w[ 5], offset); w[50] = amd_bytealign (w[ 3], w[ 4], offset); w[49] = amd_bytealign (w[ 2], w[ 3], offset); w[48] = amd_bytealign (w[ 1], w[ 2], offset); w[47] = amd_bytealign (w[ 0], w[ 1], offset); w[46] = amd_bytealign ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 47: w[63] = amd_bytealign (w[15], w[16], 
offset); w[62] = amd_bytealign (w[14], w[15], offset); w[61] = amd_bytealign (w[13], w[14], offset); w[60] = amd_bytealign (w[12], w[13], offset); w[59] = amd_bytealign (w[11], w[12], offset); w[58] = amd_bytealign (w[10], w[11], offset); w[57] = amd_bytealign (w[ 9], w[10], offset); w[56] = amd_bytealign (w[ 8], w[ 9], offset); w[55] = amd_bytealign (w[ 7], w[ 8], offset); w[54] = amd_bytealign (w[ 6], w[ 7], offset); w[53] = amd_bytealign (w[ 5], w[ 6], offset); w[52] = amd_bytealign (w[ 4], w[ 5], offset); w[51] = amd_bytealign (w[ 3], w[ 4], offset); w[50] = amd_bytealign (w[ 2], w[ 3], offset); w[49] = amd_bytealign (w[ 1], w[ 2], offset); w[48] = amd_bytealign (w[ 0], w[ 1], offset); w[47] = amd_bytealign ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = amd_bytealign (w[14], w[15], offset); w[62] = amd_bytealign (w[13], w[14], offset); w[61] = amd_bytealign (w[12], w[13], offset); w[60] = amd_bytealign (w[11], w[12], offset); w[59] = amd_bytealign (w[10], w[11], offset); w[58] = amd_bytealign (w[ 9], w[10], offset); w[57] = amd_bytealign (w[ 8], w[ 9], offset); w[56] = amd_bytealign (w[ 7], w[ 8], offset); w[55] = amd_bytealign (w[ 6], w[ 7], offset); w[54] = amd_bytealign (w[ 5], w[ 6], offset); w[53] = amd_bytealign (w[ 4], w[ 5], offset); w[52] = amd_bytealign (w[ 3], w[ 4], offset); w[51] = amd_bytealign (w[ 2], w[ 3], offset); w[50] = amd_bytealign (w[ 1], w[ 2], offset); w[49] = amd_bytealign (w[ 0], w[ 1], offset); w[48] = amd_bytealign ( 0, w[ 0], 
offset); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = amd_bytealign (w[13], w[14], offset); w[62] = amd_bytealign (w[12], w[13], offset); w[61] = amd_bytealign (w[11], w[12], offset); w[60] = amd_bytealign (w[10], w[11], offset); w[59] = amd_bytealign (w[ 9], w[10], offset); w[58] = amd_bytealign (w[ 8], w[ 9], offset); w[57] = amd_bytealign (w[ 7], w[ 8], offset); w[56] = amd_bytealign (w[ 6], w[ 7], offset); w[55] = amd_bytealign (w[ 5], w[ 6], offset); w[54] = amd_bytealign (w[ 4], w[ 5], offset); w[53] = amd_bytealign (w[ 3], w[ 4], offset); w[52] = amd_bytealign (w[ 2], w[ 3], offset); w[51] = amd_bytealign (w[ 1], w[ 2], offset); w[50] = amd_bytealign (w[ 0], w[ 1], offset); w[49] = amd_bytealign ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = amd_bytealign (w[12], w[13], offset); w[62] = amd_bytealign (w[11], w[12], offset); w[61] = amd_bytealign (w[10], w[11], offset); w[60] = amd_bytealign (w[ 9], w[10], offset); w[59] = 
amd_bytealign (w[ 8], w[ 9], offset); w[58] = amd_bytealign (w[ 7], w[ 8], offset); w[57] = amd_bytealign (w[ 6], w[ 7], offset); w[56] = amd_bytealign (w[ 5], w[ 6], offset); w[55] = amd_bytealign (w[ 4], w[ 5], offset); w[54] = amd_bytealign (w[ 3], w[ 4], offset); w[53] = amd_bytealign (w[ 2], w[ 3], offset); w[52] = amd_bytealign (w[ 1], w[ 2], offset); w[51] = amd_bytealign (w[ 0], w[ 1], offset); w[50] = amd_bytealign ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = amd_bytealign (w[11], w[12], offset); w[62] = amd_bytealign (w[10], w[11], offset); w[61] = amd_bytealign (w[ 9], w[10], offset); w[60] = amd_bytealign (w[ 8], w[ 9], offset); w[59] = amd_bytealign (w[ 7], w[ 8], offset); w[58] = amd_bytealign (w[ 6], w[ 7], offset); w[57] = amd_bytealign (w[ 5], w[ 6], offset); w[56] = amd_bytealign (w[ 4], w[ 5], offset); w[55] = amd_bytealign (w[ 3], w[ 4], offset); w[54] = amd_bytealign (w[ 2], w[ 3], offset); w[53] = amd_bytealign (w[ 1], w[ 2], offset); w[52] = amd_bytealign (w[ 0], w[ 1], offset); w[51] = amd_bytealign ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; 
w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = amd_bytealign (w[10], w[11], offset); w[62] = amd_bytealign (w[ 9], w[10], offset); w[61] = amd_bytealign (w[ 8], w[ 9], offset); w[60] = amd_bytealign (w[ 7], w[ 8], offset); w[59] = amd_bytealign (w[ 6], w[ 7], offset); w[58] = amd_bytealign (w[ 5], w[ 6], offset); w[57] = amd_bytealign (w[ 4], w[ 5], offset); w[56] = amd_bytealign (w[ 3], w[ 4], offset); w[55] = amd_bytealign (w[ 2], w[ 3], offset); w[54] = amd_bytealign (w[ 1], w[ 2], offset); w[53] = amd_bytealign (w[ 0], w[ 1], offset); w[52] = amd_bytealign ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = amd_bytealign (w[ 9], w[10], offset); w[62] = amd_bytealign (w[ 8], w[ 9], offset); w[61] = amd_bytealign (w[ 7], w[ 8], offset); w[60] = amd_bytealign (w[ 6], w[ 7], offset); w[59] = amd_bytealign (w[ 5], w[ 6], offset); w[58] = amd_bytealign (w[ 4], w[ 5], offset); w[57] = amd_bytealign (w[ 3], w[ 4], offset); w[56] = amd_bytealign (w[ 2], w[ 3], offset); w[55] = amd_bytealign (w[ 1], w[ 2], offset); w[54] = amd_bytealign (w[ 0], w[ 1], offset); w[53] = amd_bytealign ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; 
w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = amd_bytealign (w[ 8], w[ 9], offset); w[62] = amd_bytealign (w[ 7], w[ 8], offset); w[61] = amd_bytealign (w[ 6], w[ 7], offset); w[60] = amd_bytealign (w[ 5], w[ 6], offset); w[59] = amd_bytealign (w[ 4], w[ 5], offset); w[58] = amd_bytealign (w[ 3], w[ 4], offset); w[57] = amd_bytealign (w[ 2], w[ 3], offset); w[56] = amd_bytealign (w[ 1], w[ 2], offset); w[55] = amd_bytealign (w[ 0], w[ 1], offset); w[54] = amd_bytealign ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = amd_bytealign (w[ 7], w[ 8], offset); w[62] = amd_bytealign (w[ 6], w[ 7], offset); w[61] = amd_bytealign (w[ 5], w[ 6], offset); w[60] = amd_bytealign (w[ 4], w[ 5], offset); w[59] = amd_bytealign (w[ 3], w[ 4], offset); w[58] = amd_bytealign (w[ 2], w[ 3], offset); w[57] = amd_bytealign (w[ 1], w[ 2], offset); w[56] = amd_bytealign (w[ 0], w[ 1], offset); w[55] = amd_bytealign ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; 
w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 56: w[63] = amd_bytealign (w[ 6], w[ 7], offset); w[62] = amd_bytealign (w[ 5], w[ 6], offset); w[61] = amd_bytealign (w[ 4], w[ 5], offset); w[60] = amd_bytealign (w[ 3], w[ 4], offset); w[59] = amd_bytealign (w[ 2], w[ 3], offset); w[58] = amd_bytealign (w[ 1], w[ 2], offset); w[57] = amd_bytealign (w[ 0], w[ 1], offset); w[56] = amd_bytealign ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = amd_bytealign (w[ 5], w[ 6], offset); w[62] = amd_bytealign (w[ 4], w[ 5], offset); w[61] = amd_bytealign (w[ 3], w[ 4], offset); w[60] = amd_bytealign (w[ 2], w[ 3], offset); w[59] = amd_bytealign (w[ 1], w[ 2], offset); w[58] = amd_bytealign (w[ 0], w[ 1], offset); w[57] = amd_bytealign ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 
0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = amd_bytealign (w[ 4], w[ 5], offset); w[62] = amd_bytealign (w[ 3], w[ 4], offset); w[61] = amd_bytealign (w[ 2], w[ 3], offset); w[60] = amd_bytealign (w[ 1], w[ 2], offset); w[59] = amd_bytealign (w[ 0], w[ 1], offset); w[58] = amd_bytealign ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = amd_bytealign (w[ 3], w[ 4], offset); w[62] = amd_bytealign (w[ 2], w[ 3], offset); w[61] = amd_bytealign (w[ 1], w[ 2], offset); w[60] = amd_bytealign (w[ 0], w[ 1], offset); w[59] = amd_bytealign ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; 
w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = amd_bytealign (w[ 2], w[ 3], offset); w[62] = amd_bytealign (w[ 1], w[ 2], offset); w[61] = amd_bytealign (w[ 0], w[ 1], offset); w[60] = amd_bytealign ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = amd_bytealign (w[ 1], w[ 2], offset); w[62] = amd_bytealign (w[ 0], w[ 1], offset); w[61] = amd_bytealign ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = 
amd_bytealign (w[ 0], w[ 1], offset); w[62] = amd_bytealign ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 63: w[63] = amd_bytealign ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #pragma unroll for (int i = 0; i < 64; i++) w[i] = swap32 (w[i]); #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif switch (offset_switch) { case 0: w[63] = __byte_perm (w[62], w[63], selector); w[62] = __byte_perm (w[61], w[62], selector); w[61] = __byte_perm (w[60], w[61], 
selector); w[60] = __byte_perm (w[59], w[60], selector); w[59] = __byte_perm (w[58], w[59], selector); w[58] = __byte_perm (w[57], w[58], selector); w[57] = __byte_perm (w[56], w[57], selector); w[56] = __byte_perm (w[55], w[56], selector); w[55] = __byte_perm (w[54], w[55], selector); w[54] = __byte_perm (w[53], w[54], selector); w[53] = __byte_perm (w[52], w[53], selector); w[52] = __byte_perm (w[51], w[52], selector); w[51] = __byte_perm (w[50], w[51], selector); w[50] = __byte_perm (w[49], w[50], selector); w[49] = __byte_perm (w[48], w[49], selector); w[48] = __byte_perm (w[47], w[48], selector); w[47] = __byte_perm (w[46], w[47], selector); w[46] = __byte_perm (w[45], w[46], selector); w[45] = __byte_perm (w[44], w[45], selector); w[44] = __byte_perm (w[43], w[44], selector); w[43] = __byte_perm (w[42], w[43], selector); w[42] = __byte_perm (w[41], w[42], selector); w[41] = __byte_perm (w[40], w[41], selector); w[40] = __byte_perm (w[39], w[40], selector); w[39] = __byte_perm (w[38], w[39], selector); w[38] = __byte_perm (w[37], w[38], selector); w[37] = __byte_perm (w[36], w[37], selector); w[36] = __byte_perm (w[35], w[36], selector); w[35] = __byte_perm (w[34], w[35], selector); w[34] = __byte_perm (w[33], w[34], selector); w[33] = __byte_perm (w[32], w[33], selector); w[32] = __byte_perm (w[31], w[32], selector); w[31] = __byte_perm (w[30], w[31], selector); w[30] = __byte_perm (w[29], w[30], selector); w[29] = __byte_perm (w[28], w[29], selector); w[28] = __byte_perm (w[27], w[28], selector); w[27] = __byte_perm (w[26], w[27], selector); w[26] = __byte_perm (w[25], w[26], selector); w[25] = __byte_perm (w[24], w[25], selector); w[24] = __byte_perm (w[23], w[24], selector); w[23] = __byte_perm (w[22], w[23], selector); w[22] = __byte_perm (w[21], w[22], selector); w[21] = __byte_perm (w[20], w[21], selector); w[20] = __byte_perm (w[19], w[20], selector); w[19] = __byte_perm (w[18], w[19], selector); w[18] = __byte_perm (w[17], w[18], selector); w[17] = 
__byte_perm (w[16], w[17], selector); w[16] = __byte_perm (w[15], w[16], selector); w[15] = __byte_perm (w[14], w[15], selector); w[14] = __byte_perm (w[13], w[14], selector); w[13] = __byte_perm (w[12], w[13], selector); w[12] = __byte_perm (w[11], w[12], selector); w[11] = __byte_perm (w[10], w[11], selector); w[10] = __byte_perm (w[ 9], w[10], selector); w[ 9] = __byte_perm (w[ 8], w[ 9], selector); w[ 8] = __byte_perm (w[ 7], w[ 8], selector); w[ 7] = __byte_perm (w[ 6], w[ 7], selector); w[ 6] = __byte_perm (w[ 5], w[ 6], selector); w[ 5] = __byte_perm (w[ 4], w[ 5], selector); w[ 4] = __byte_perm (w[ 3], w[ 4], selector); w[ 3] = __byte_perm (w[ 2], w[ 3], selector); w[ 2] = __byte_perm (w[ 1], w[ 2], selector); w[ 1] = __byte_perm (w[ 0], w[ 1], selector); w[ 0] = __byte_perm ( 0, w[ 0], selector); break; case 1: w[63] = __byte_perm (w[61], w[62], selector); w[62] = __byte_perm (w[60], w[61], selector); w[61] = __byte_perm (w[59], w[60], selector); w[60] = __byte_perm (w[58], w[59], selector); w[59] = __byte_perm (w[57], w[58], selector); w[58] = __byte_perm (w[56], w[57], selector); w[57] = __byte_perm (w[55], w[56], selector); w[56] = __byte_perm (w[54], w[55], selector); w[55] = __byte_perm (w[53], w[54], selector); w[54] = __byte_perm (w[52], w[53], selector); w[53] = __byte_perm (w[51], w[52], selector); w[52] = __byte_perm (w[50], w[51], selector); w[51] = __byte_perm (w[49], w[50], selector); w[50] = __byte_perm (w[48], w[49], selector); w[49] = __byte_perm (w[47], w[48], selector); w[48] = __byte_perm (w[46], w[47], selector); w[47] = __byte_perm (w[45], w[46], selector); w[46] = __byte_perm (w[44], w[45], selector); w[45] = __byte_perm (w[43], w[44], selector); w[44] = __byte_perm (w[42], w[43], selector); w[43] = __byte_perm (w[41], w[42], selector); w[42] = __byte_perm (w[40], w[41], selector); w[41] = __byte_perm (w[39], w[40], selector); w[40] = __byte_perm (w[38], w[39], selector); w[39] = __byte_perm (w[37], w[38], selector); w[38] = 
__byte_perm (w[36], w[37], selector); w[37] = __byte_perm (w[35], w[36], selector); w[36] = __byte_perm (w[34], w[35], selector); w[35] = __byte_perm (w[33], w[34], selector); w[34] = __byte_perm (w[32], w[33], selector); w[33] = __byte_perm (w[31], w[32], selector); w[32] = __byte_perm (w[30], w[31], selector); w[31] = __byte_perm (w[29], w[30], selector); w[30] = __byte_perm (w[28], w[29], selector); w[29] = __byte_perm (w[27], w[28], selector); w[28] = __byte_perm (w[26], w[27], selector); w[27] = __byte_perm (w[25], w[26], selector); w[26] = __byte_perm (w[24], w[25], selector); w[25] = __byte_perm (w[23], w[24], selector); w[24] = __byte_perm (w[22], w[23], selector); w[23] = __byte_perm (w[21], w[22], selector); w[22] = __byte_perm (w[20], w[21], selector); w[21] = __byte_perm (w[19], w[20], selector); w[20] = __byte_perm (w[18], w[19], selector); w[19] = __byte_perm (w[17], w[18], selector); w[18] = __byte_perm (w[16], w[17], selector); w[17] = __byte_perm (w[15], w[16], selector); w[16] = __byte_perm (w[14], w[15], selector); w[15] = __byte_perm (w[13], w[14], selector); w[14] = __byte_perm (w[12], w[13], selector); w[13] = __byte_perm (w[11], w[12], selector); w[12] = __byte_perm (w[10], w[11], selector); w[11] = __byte_perm (w[ 9], w[10], selector); w[10] = __byte_perm (w[ 8], w[ 9], selector); w[ 9] = __byte_perm (w[ 7], w[ 8], selector); w[ 8] = __byte_perm (w[ 6], w[ 7], selector); w[ 7] = __byte_perm (w[ 5], w[ 6], selector); w[ 6] = __byte_perm (w[ 4], w[ 5], selector); w[ 5] = __byte_perm (w[ 3], w[ 4], selector); w[ 4] = __byte_perm (w[ 2], w[ 3], selector); w[ 3] = __byte_perm (w[ 1], w[ 2], selector); w[ 2] = __byte_perm (w[ 0], w[ 1], selector); w[ 1] = __byte_perm ( 0, w[ 0], selector); w[ 0] = 0; break; case 2: w[63] = __byte_perm (w[60], w[61], selector); w[62] = __byte_perm (w[59], w[60], selector); w[61] = __byte_perm (w[58], w[59], selector); w[60] = __byte_perm (w[57], w[58], selector); w[59] = __byte_perm (w[56], w[57], selector); w[58] 
= __byte_perm (w[55], w[56], selector); w[57] = __byte_perm (w[54], w[55], selector); w[56] = __byte_perm (w[53], w[54], selector); w[55] = __byte_perm (w[52], w[53], selector); w[54] = __byte_perm (w[51], w[52], selector); w[53] = __byte_perm (w[50], w[51], selector); w[52] = __byte_perm (w[49], w[50], selector); w[51] = __byte_perm (w[48], w[49], selector); w[50] = __byte_perm (w[47], w[48], selector); w[49] = __byte_perm (w[46], w[47], selector); w[48] = __byte_perm (w[45], w[46], selector); w[47] = __byte_perm (w[44], w[45], selector); w[46] = __byte_perm (w[43], w[44], selector); w[45] = __byte_perm (w[42], w[43], selector); w[44] = __byte_perm (w[41], w[42], selector); w[43] = __byte_perm (w[40], w[41], selector); w[42] = __byte_perm (w[39], w[40], selector); w[41] = __byte_perm (w[38], w[39], selector); w[40] = __byte_perm (w[37], w[38], selector); w[39] = __byte_perm (w[36], w[37], selector); w[38] = __byte_perm (w[35], w[36], selector); w[37] = __byte_perm (w[34], w[35], selector); w[36] = __byte_perm (w[33], w[34], selector); w[35] = __byte_perm (w[32], w[33], selector); w[34] = __byte_perm (w[31], w[32], selector); w[33] = __byte_perm (w[30], w[31], selector); w[32] = __byte_perm (w[29], w[30], selector); w[31] = __byte_perm (w[28], w[29], selector); w[30] = __byte_perm (w[27], w[28], selector); w[29] = __byte_perm (w[26], w[27], selector); w[28] = __byte_perm (w[25], w[26], selector); w[27] = __byte_perm (w[24], w[25], selector); w[26] = __byte_perm (w[23], w[24], selector); w[25] = __byte_perm (w[22], w[23], selector); w[24] = __byte_perm (w[21], w[22], selector); w[23] = __byte_perm (w[20], w[21], selector); w[22] = __byte_perm (w[19], w[20], selector); w[21] = __byte_perm (w[18], w[19], selector); w[20] = __byte_perm (w[17], w[18], selector); w[19] = __byte_perm (w[16], w[17], selector); w[18] = __byte_perm (w[15], w[16], selector); w[17] = __byte_perm (w[14], w[15], selector); w[16] = __byte_perm (w[13], w[14], selector); w[15] = __byte_perm (w[12], 
w[13], selector); w[14] = __byte_perm (w[11], w[12], selector); w[13] = __byte_perm (w[10], w[11], selector); w[12] = __byte_perm (w[ 9], w[10], selector); w[11] = __byte_perm (w[ 8], w[ 9], selector); w[10] = __byte_perm (w[ 7], w[ 8], selector); w[ 9] = __byte_perm (w[ 6], w[ 7], selector); w[ 8] = __byte_perm (w[ 5], w[ 6], selector); w[ 7] = __byte_perm (w[ 4], w[ 5], selector); w[ 6] = __byte_perm (w[ 3], w[ 4], selector); w[ 5] = __byte_perm (w[ 2], w[ 3], selector); w[ 4] = __byte_perm (w[ 1], w[ 2], selector); w[ 3] = __byte_perm (w[ 0], w[ 1], selector); w[ 2] = __byte_perm ( 0, w[ 0], selector); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = __byte_perm (w[59], w[60], selector); w[62] = __byte_perm (w[58], w[59], selector); w[61] = __byte_perm (w[57], w[58], selector); w[60] = __byte_perm (w[56], w[57], selector); w[59] = __byte_perm (w[55], w[56], selector); w[58] = __byte_perm (w[54], w[55], selector); w[57] = __byte_perm (w[53], w[54], selector); w[56] = __byte_perm (w[52], w[53], selector); w[55] = __byte_perm (w[51], w[52], selector); w[54] = __byte_perm (w[50], w[51], selector); w[53] = __byte_perm (w[49], w[50], selector); w[52] = __byte_perm (w[48], w[49], selector); w[51] = __byte_perm (w[47], w[48], selector); w[50] = __byte_perm (w[46], w[47], selector); w[49] = __byte_perm (w[45], w[46], selector); w[48] = __byte_perm (w[44], w[45], selector); w[47] = __byte_perm (w[43], w[44], selector); w[46] = __byte_perm (w[42], w[43], selector); w[45] = __byte_perm (w[41], w[42], selector); w[44] = __byte_perm (w[40], w[41], selector); w[43] = __byte_perm (w[39], w[40], selector); w[42] = __byte_perm (w[38], w[39], selector); w[41] = __byte_perm (w[37], w[38], selector); w[40] = __byte_perm (w[36], w[37], selector); w[39] = __byte_perm (w[35], w[36], selector); w[38] = __byte_perm (w[34], w[35], selector); w[37] = __byte_perm (w[33], w[34], selector); w[36] = __byte_perm (w[32], w[33], selector); w[35] = __byte_perm (w[31], w[32], selector); w[34] = 
__byte_perm (w[30], w[31], selector); w[33] = __byte_perm (w[29], w[30], selector); w[32] = __byte_perm (w[28], w[29], selector); w[31] = __byte_perm (w[27], w[28], selector); w[30] = __byte_perm (w[26], w[27], selector); w[29] = __byte_perm (w[25], w[26], selector); w[28] = __byte_perm (w[24], w[25], selector); w[27] = __byte_perm (w[23], w[24], selector); w[26] = __byte_perm (w[22], w[23], selector); w[25] = __byte_perm (w[21], w[22], selector); w[24] = __byte_perm (w[20], w[21], selector); w[23] = __byte_perm (w[19], w[20], selector); w[22] = __byte_perm (w[18], w[19], selector); w[21] = __byte_perm (w[17], w[18], selector); w[20] = __byte_perm (w[16], w[17], selector); w[19] = __byte_perm (w[15], w[16], selector); w[18] = __byte_perm (w[14], w[15], selector); w[17] = __byte_perm (w[13], w[14], selector); w[16] = __byte_perm (w[12], w[13], selector); w[15] = __byte_perm (w[11], w[12], selector); w[14] = __byte_perm (w[10], w[11], selector); w[13] = __byte_perm (w[ 9], w[10], selector); w[12] = __byte_perm (w[ 8], w[ 9], selector); w[11] = __byte_perm (w[ 7], w[ 8], selector); w[10] = __byte_perm (w[ 6], w[ 7], selector); w[ 9] = __byte_perm (w[ 5], w[ 6], selector); w[ 8] = __byte_perm (w[ 4], w[ 5], selector); w[ 7] = __byte_perm (w[ 3], w[ 4], selector); w[ 6] = __byte_perm (w[ 2], w[ 3], selector); w[ 5] = __byte_perm (w[ 1], w[ 2], selector); w[ 4] = __byte_perm (w[ 0], w[ 1], selector); w[ 3] = __byte_perm ( 0, w[ 0], selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = __byte_perm (w[58], w[59], selector); w[62] = __byte_perm (w[57], w[58], selector); w[61] = __byte_perm (w[56], w[57], selector); w[60] = __byte_perm (w[55], w[56], selector); w[59] = __byte_perm (w[54], w[55], selector); w[58] = __byte_perm (w[53], w[54], selector); w[57] = __byte_perm (w[52], w[53], selector); w[56] = __byte_perm (w[51], w[52], selector); w[55] = __byte_perm (w[50], w[51], selector); w[54] = __byte_perm (w[49], w[50], selector); w[53] = __byte_perm (w[48], 
w[49], selector); w[52] = __byte_perm (w[47], w[48], selector); w[51] = __byte_perm (w[46], w[47], selector); w[50] = __byte_perm (w[45], w[46], selector); w[49] = __byte_perm (w[44], w[45], selector); w[48] = __byte_perm (w[43], w[44], selector); w[47] = __byte_perm (w[42], w[43], selector); w[46] = __byte_perm (w[41], w[42], selector); w[45] = __byte_perm (w[40], w[41], selector); w[44] = __byte_perm (w[39], w[40], selector); w[43] = __byte_perm (w[38], w[39], selector); w[42] = __byte_perm (w[37], w[38], selector); w[41] = __byte_perm (w[36], w[37], selector); w[40] = __byte_perm (w[35], w[36], selector); w[39] = __byte_perm (w[34], w[35], selector); w[38] = __byte_perm (w[33], w[34], selector); w[37] = __byte_perm (w[32], w[33], selector); w[36] = __byte_perm (w[31], w[32], selector); w[35] = __byte_perm (w[30], w[31], selector); w[34] = __byte_perm (w[29], w[30], selector); w[33] = __byte_perm (w[28], w[29], selector); w[32] = __byte_perm (w[27], w[28], selector); w[31] = __byte_perm (w[26], w[27], selector); w[30] = __byte_perm (w[25], w[26], selector); w[29] = __byte_perm (w[24], w[25], selector); w[28] = __byte_perm (w[23], w[24], selector); w[27] = __byte_perm (w[22], w[23], selector); w[26] = __byte_perm (w[21], w[22], selector); w[25] = __byte_perm (w[20], w[21], selector); w[24] = __byte_perm (w[19], w[20], selector); w[23] = __byte_perm (w[18], w[19], selector); w[22] = __byte_perm (w[17], w[18], selector); w[21] = __byte_perm (w[16], w[17], selector); w[20] = __byte_perm (w[15], w[16], selector); w[19] = __byte_perm (w[14], w[15], selector); w[18] = __byte_perm (w[13], w[14], selector); w[17] = __byte_perm (w[12], w[13], selector); w[16] = __byte_perm (w[11], w[12], selector); w[15] = __byte_perm (w[10], w[11], selector); w[14] = __byte_perm (w[ 9], w[10], selector); w[13] = __byte_perm (w[ 8], w[ 9], selector); w[12] = __byte_perm (w[ 7], w[ 8], selector); w[11] = __byte_perm (w[ 6], w[ 7], selector); w[10] = __byte_perm (w[ 5], w[ 6], selector); w[ 
9] = __byte_perm (w[ 4], w[ 5], selector); w[ 8] = __byte_perm (w[ 3], w[ 4], selector); w[ 7] = __byte_perm (w[ 2], w[ 3], selector); w[ 6] = __byte_perm (w[ 1], w[ 2], selector); w[ 5] = __byte_perm (w[ 0], w[ 1], selector); w[ 4] = __byte_perm ( 0, w[ 0], selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = __byte_perm (w[57], w[58], selector); w[62] = __byte_perm (w[56], w[57], selector); w[61] = __byte_perm (w[55], w[56], selector); w[60] = __byte_perm (w[54], w[55], selector); w[59] = __byte_perm (w[53], w[54], selector); w[58] = __byte_perm (w[52], w[53], selector); w[57] = __byte_perm (w[51], w[52], selector); w[56] = __byte_perm (w[50], w[51], selector); w[55] = __byte_perm (w[49], w[50], selector); w[54] = __byte_perm (w[48], w[49], selector); w[53] = __byte_perm (w[47], w[48], selector); w[52] = __byte_perm (w[46], w[47], selector); w[51] = __byte_perm (w[45], w[46], selector); w[50] = __byte_perm (w[44], w[45], selector); w[49] = __byte_perm (w[43], w[44], selector); w[48] = __byte_perm (w[42], w[43], selector); w[47] = __byte_perm (w[41], w[42], selector); w[46] = __byte_perm (w[40], w[41], selector); w[45] = __byte_perm (w[39], w[40], selector); w[44] = __byte_perm (w[38], w[39], selector); w[43] = __byte_perm (w[37], w[38], selector); w[42] = __byte_perm (w[36], w[37], selector); w[41] = __byte_perm (w[35], w[36], selector); w[40] = __byte_perm (w[34], w[35], selector); w[39] = __byte_perm (w[33], w[34], selector); w[38] = __byte_perm (w[32], w[33], selector); w[37] = __byte_perm (w[31], w[32], selector); w[36] = __byte_perm (w[30], w[31], selector); w[35] = __byte_perm (w[29], w[30], selector); w[34] = __byte_perm (w[28], w[29], selector); w[33] = __byte_perm (w[27], w[28], selector); w[32] = __byte_perm (w[26], w[27], selector); w[31] = __byte_perm (w[25], w[26], selector); w[30] = __byte_perm (w[24], w[25], selector); w[29] = __byte_perm (w[23], w[24], selector); w[28] = __byte_perm (w[22], w[23], selector); w[27] = 
__byte_perm (w[21], w[22], selector); w[26] = __byte_perm (w[20], w[21], selector); w[25] = __byte_perm (w[19], w[20], selector); w[24] = __byte_perm (w[18], w[19], selector); w[23] = __byte_perm (w[17], w[18], selector); w[22] = __byte_perm (w[16], w[17], selector); w[21] = __byte_perm (w[15], w[16], selector); w[20] = __byte_perm (w[14], w[15], selector); w[19] = __byte_perm (w[13], w[14], selector); w[18] = __byte_perm (w[12], w[13], selector); w[17] = __byte_perm (w[11], w[12], selector); w[16] = __byte_perm (w[10], w[11], selector); w[15] = __byte_perm (w[ 9], w[10], selector); w[14] = __byte_perm (w[ 8], w[ 9], selector); w[13] = __byte_perm (w[ 7], w[ 8], selector); w[12] = __byte_perm (w[ 6], w[ 7], selector); w[11] = __byte_perm (w[ 5], w[ 6], selector); w[10] = __byte_perm (w[ 4], w[ 5], selector); w[ 9] = __byte_perm (w[ 3], w[ 4], selector); w[ 8] = __byte_perm (w[ 2], w[ 3], selector); w[ 7] = __byte_perm (w[ 1], w[ 2], selector); w[ 6] = __byte_perm (w[ 0], w[ 1], selector); w[ 5] = __byte_perm ( 0, w[ 0], selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = __byte_perm (w[56], w[57], selector); w[62] = __byte_perm (w[55], w[56], selector); w[61] = __byte_perm (w[54], w[55], selector); w[60] = __byte_perm (w[53], w[54], selector); w[59] = __byte_perm (w[52], w[53], selector); w[58] = __byte_perm (w[51], w[52], selector); w[57] = __byte_perm (w[50], w[51], selector); w[56] = __byte_perm (w[49], w[50], selector); w[55] = __byte_perm (w[48], w[49], selector); w[54] = __byte_perm (w[47], w[48], selector); w[53] = __byte_perm (w[46], w[47], selector); w[52] = __byte_perm (w[45], w[46], selector); w[51] = __byte_perm (w[44], w[45], selector); w[50] = __byte_perm (w[43], w[44], selector); w[49] = __byte_perm (w[42], w[43], selector); w[48] = __byte_perm (w[41], w[42], selector); w[47] = __byte_perm (w[40], w[41], selector); w[46] = __byte_perm (w[39], w[40], selector); w[45] = __byte_perm (w[38], w[39], selector); w[44] = 
__byte_perm (w[37], w[38], selector); w[43] = __byte_perm (w[36], w[37], selector); w[42] = __byte_perm (w[35], w[36], selector); w[41] = __byte_perm (w[34], w[35], selector); w[40] = __byte_perm (w[33], w[34], selector); w[39] = __byte_perm (w[32], w[33], selector); w[38] = __byte_perm (w[31], w[32], selector); w[37] = __byte_perm (w[30], w[31], selector); w[36] = __byte_perm (w[29], w[30], selector); w[35] = __byte_perm (w[28], w[29], selector); w[34] = __byte_perm (w[27], w[28], selector); w[33] = __byte_perm (w[26], w[27], selector); w[32] = __byte_perm (w[25], w[26], selector); w[31] = __byte_perm (w[24], w[25], selector); w[30] = __byte_perm (w[23], w[24], selector); w[29] = __byte_perm (w[22], w[23], selector); w[28] = __byte_perm (w[21], w[22], selector); w[27] = __byte_perm (w[20], w[21], selector); w[26] = __byte_perm (w[19], w[20], selector); w[25] = __byte_perm (w[18], w[19], selector); w[24] = __byte_perm (w[17], w[18], selector); w[23] = __byte_perm (w[16], w[17], selector); w[22] = __byte_perm (w[15], w[16], selector); w[21] = __byte_perm (w[14], w[15], selector); w[20] = __byte_perm (w[13], w[14], selector); w[19] = __byte_perm (w[12], w[13], selector); w[18] = __byte_perm (w[11], w[12], selector); w[17] = __byte_perm (w[10], w[11], selector); w[16] = __byte_perm (w[ 9], w[10], selector); w[15] = __byte_perm (w[ 8], w[ 9], selector); w[14] = __byte_perm (w[ 7], w[ 8], selector); w[13] = __byte_perm (w[ 6], w[ 7], selector); w[12] = __byte_perm (w[ 5], w[ 6], selector); w[11] = __byte_perm (w[ 4], w[ 5], selector); w[10] = __byte_perm (w[ 3], w[ 4], selector); w[ 9] = __byte_perm (w[ 2], w[ 3], selector); w[ 8] = __byte_perm (w[ 1], w[ 2], selector); w[ 7] = __byte_perm (w[ 0], w[ 1], selector); w[ 6] = __byte_perm ( 0, w[ 0], selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = __byte_perm (w[55], w[56], selector); w[62] = __byte_perm (w[54], w[55], selector); w[61] = __byte_perm (w[53], w[54], 
selector); w[60] = __byte_perm (w[52], w[53], selector); w[59] = __byte_perm (w[51], w[52], selector); w[58] = __byte_perm (w[50], w[51], selector); w[57] = __byte_perm (w[49], w[50], selector); w[56] = __byte_perm (w[48], w[49], selector); w[55] = __byte_perm (w[47], w[48], selector); w[54] = __byte_perm (w[46], w[47], selector); w[53] = __byte_perm (w[45], w[46], selector); w[52] = __byte_perm (w[44], w[45], selector); w[51] = __byte_perm (w[43], w[44], selector); w[50] = __byte_perm (w[42], w[43], selector); w[49] = __byte_perm (w[41], w[42], selector); w[48] = __byte_perm (w[40], w[41], selector); w[47] = __byte_perm (w[39], w[40], selector); w[46] = __byte_perm (w[38], w[39], selector); w[45] = __byte_perm (w[37], w[38], selector); w[44] = __byte_perm (w[36], w[37], selector); w[43] = __byte_perm (w[35], w[36], selector); w[42] = __byte_perm (w[34], w[35], selector); w[41] = __byte_perm (w[33], w[34], selector); w[40] = __byte_perm (w[32], w[33], selector); w[39] = __byte_perm (w[31], w[32], selector); w[38] = __byte_perm (w[30], w[31], selector); w[37] = __byte_perm (w[29], w[30], selector); w[36] = __byte_perm (w[28], w[29], selector); w[35] = __byte_perm (w[27], w[28], selector); w[34] = __byte_perm (w[26], w[27], selector); w[33] = __byte_perm (w[25], w[26], selector); w[32] = __byte_perm (w[24], w[25], selector); w[31] = __byte_perm (w[23], w[24], selector); w[30] = __byte_perm (w[22], w[23], selector); w[29] = __byte_perm (w[21], w[22], selector); w[28] = __byte_perm (w[20], w[21], selector); w[27] = __byte_perm (w[19], w[20], selector); w[26] = __byte_perm (w[18], w[19], selector); w[25] = __byte_perm (w[17], w[18], selector); w[24] = __byte_perm (w[16], w[17], selector); w[23] = __byte_perm (w[15], w[16], selector); w[22] = __byte_perm (w[14], w[15], selector); w[21] = __byte_perm (w[13], w[14], selector); w[20] = __byte_perm (w[12], w[13], selector); w[19] = __byte_perm (w[11], w[12], selector); w[18] = __byte_perm (w[10], w[11], selector); w[17] = 
__byte_perm (w[ 9], w[10], selector); w[16] = __byte_perm (w[ 8], w[ 9], selector); w[15] = __byte_perm (w[ 7], w[ 8], selector); w[14] = __byte_perm (w[ 6], w[ 7], selector); w[13] = __byte_perm (w[ 5], w[ 6], selector); w[12] = __byte_perm (w[ 4], w[ 5], selector); w[11] = __byte_perm (w[ 3], w[ 4], selector); w[10] = __byte_perm (w[ 2], w[ 3], selector); w[ 9] = __byte_perm (w[ 1], w[ 2], selector); w[ 8] = __byte_perm (w[ 0], w[ 1], selector); w[ 7] = __byte_perm ( 0, w[ 0], selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = __byte_perm (w[54], w[55], selector); w[62] = __byte_perm (w[53], w[54], selector); w[61] = __byte_perm (w[52], w[53], selector); w[60] = __byte_perm (w[51], w[52], selector); w[59] = __byte_perm (w[50], w[51], selector); w[58] = __byte_perm (w[49], w[50], selector); w[57] = __byte_perm (w[48], w[49], selector); w[56] = __byte_perm (w[47], w[48], selector); w[55] = __byte_perm (w[46], w[47], selector); w[54] = __byte_perm (w[45], w[46], selector); w[53] = __byte_perm (w[44], w[45], selector); w[52] = __byte_perm (w[43], w[44], selector); w[51] = __byte_perm (w[42], w[43], selector); w[50] = __byte_perm (w[41], w[42], selector); w[49] = __byte_perm (w[40], w[41], selector); w[48] = __byte_perm (w[39], w[40], selector); w[47] = __byte_perm (w[38], w[39], selector); w[46] = __byte_perm (w[37], w[38], selector); w[45] = __byte_perm (w[36], w[37], selector); w[44] = __byte_perm (w[35], w[36], selector); w[43] = __byte_perm (w[34], w[35], selector); w[42] = __byte_perm (w[33], w[34], selector); w[41] = __byte_perm (w[32], w[33], selector); w[40] = __byte_perm (w[31], w[32], selector); w[39] = __byte_perm (w[30], w[31], selector); w[38] = __byte_perm (w[29], w[30], selector); w[37] = __byte_perm (w[28], w[29], selector); w[36] = __byte_perm (w[27], w[28], selector); w[35] = __byte_perm (w[26], w[27], selector); w[34] = __byte_perm (w[25], w[26], selector); w[33] = __byte_perm (w[24], 
w[25], selector); w[32] = __byte_perm (w[23], w[24], selector); w[31] = __byte_perm (w[22], w[23], selector); w[30] = __byte_perm (w[21], w[22], selector); w[29] = __byte_perm (w[20], w[21], selector); w[28] = __byte_perm (w[19], w[20], selector); w[27] = __byte_perm (w[18], w[19], selector); w[26] = __byte_perm (w[17], w[18], selector); w[25] = __byte_perm (w[16], w[17], selector); w[24] = __byte_perm (w[15], w[16], selector); w[23] = __byte_perm (w[14], w[15], selector); w[22] = __byte_perm (w[13], w[14], selector); w[21] = __byte_perm (w[12], w[13], selector); w[20] = __byte_perm (w[11], w[12], selector); w[19] = __byte_perm (w[10], w[11], selector); w[18] = __byte_perm (w[ 9], w[10], selector); w[17] = __byte_perm (w[ 8], w[ 9], selector); w[16] = __byte_perm (w[ 7], w[ 8], selector); w[15] = __byte_perm (w[ 6], w[ 7], selector); w[14] = __byte_perm (w[ 5], w[ 6], selector); w[13] = __byte_perm (w[ 4], w[ 5], selector); w[12] = __byte_perm (w[ 3], w[ 4], selector); w[11] = __byte_perm (w[ 2], w[ 3], selector); w[10] = __byte_perm (w[ 1], w[ 2], selector); w[ 9] = __byte_perm (w[ 0], w[ 1], selector); w[ 8] = __byte_perm ( 0, w[ 0], selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = __byte_perm (w[53], w[54], selector); w[62] = __byte_perm (w[52], w[53], selector); w[61] = __byte_perm (w[51], w[52], selector); w[60] = __byte_perm (w[50], w[51], selector); w[59] = __byte_perm (w[49], w[50], selector); w[58] = __byte_perm (w[48], w[49], selector); w[57] = __byte_perm (w[47], w[48], selector); w[56] = __byte_perm (w[46], w[47], selector); w[55] = __byte_perm (w[45], w[46], selector); w[54] = __byte_perm (w[44], w[45], selector); w[53] = __byte_perm (w[43], w[44], selector); w[52] = __byte_perm (w[42], w[43], selector); w[51] = __byte_perm (w[41], w[42], selector); w[50] = __byte_perm (w[40], w[41], selector); w[49] = __byte_perm (w[39], w[40], selector); w[48] = __byte_perm (w[38], w[39], 
selector); w[47] = __byte_perm (w[37], w[38], selector); w[46] = __byte_perm (w[36], w[37], selector); w[45] = __byte_perm (w[35], w[36], selector); w[44] = __byte_perm (w[34], w[35], selector); w[43] = __byte_perm (w[33], w[34], selector); w[42] = __byte_perm (w[32], w[33], selector); w[41] = __byte_perm (w[31], w[32], selector); w[40] = __byte_perm (w[30], w[31], selector); w[39] = __byte_perm (w[29], w[30], selector); w[38] = __byte_perm (w[28], w[29], selector); w[37] = __byte_perm (w[27], w[28], selector); w[36] = __byte_perm (w[26], w[27], selector); w[35] = __byte_perm (w[25], w[26], selector); w[34] = __byte_perm (w[24], w[25], selector); w[33] = __byte_perm (w[23], w[24], selector); w[32] = __byte_perm (w[22], w[23], selector); w[31] = __byte_perm (w[21], w[22], selector); w[30] = __byte_perm (w[20], w[21], selector); w[29] = __byte_perm (w[19], w[20], selector); w[28] = __byte_perm (w[18], w[19], selector); w[27] = __byte_perm (w[17], w[18], selector); w[26] = __byte_perm (w[16], w[17], selector); w[25] = __byte_perm (w[15], w[16], selector); w[24] = __byte_perm (w[14], w[15], selector); w[23] = __byte_perm (w[13], w[14], selector); w[22] = __byte_perm (w[12], w[13], selector); w[21] = __byte_perm (w[11], w[12], selector); w[20] = __byte_perm (w[10], w[11], selector); w[19] = __byte_perm (w[ 9], w[10], selector); w[18] = __byte_perm (w[ 8], w[ 9], selector); w[17] = __byte_perm (w[ 7], w[ 8], selector); w[16] = __byte_perm (w[ 6], w[ 7], selector); w[15] = __byte_perm (w[ 5], w[ 6], selector); w[14] = __byte_perm (w[ 4], w[ 5], selector); w[13] = __byte_perm (w[ 3], w[ 4], selector); w[12] = __byte_perm (w[ 2], w[ 3], selector); w[11] = __byte_perm (w[ 1], w[ 2], selector); w[10] = __byte_perm (w[ 0], w[ 1], selector); w[ 9] = __byte_perm ( 0, w[ 0], selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = __byte_perm (w[52], w[53], selector); w[62] = __byte_perm (w[51], w[52], 
selector); w[61] = __byte_perm (w[50], w[51], selector); w[60] = __byte_perm (w[49], w[50], selector); w[59] = __byte_perm (w[48], w[49], selector); w[58] = __byte_perm (w[47], w[48], selector); w[57] = __byte_perm (w[46], w[47], selector); w[56] = __byte_perm (w[45], w[46], selector); w[55] = __byte_perm (w[44], w[45], selector); w[54] = __byte_perm (w[43], w[44], selector); w[53] = __byte_perm (w[42], w[43], selector); w[52] = __byte_perm (w[41], w[42], selector); w[51] = __byte_perm (w[40], w[41], selector); w[50] = __byte_perm (w[39], w[40], selector); w[49] = __byte_perm (w[38], w[39], selector); w[48] = __byte_perm (w[37], w[38], selector); w[47] = __byte_perm (w[36], w[37], selector); w[46] = __byte_perm (w[35], w[36], selector); w[45] = __byte_perm (w[34], w[35], selector); w[44] = __byte_perm (w[33], w[34], selector); w[43] = __byte_perm (w[32], w[33], selector); w[42] = __byte_perm (w[31], w[32], selector); w[41] = __byte_perm (w[30], w[31], selector); w[40] = __byte_perm (w[29], w[30], selector); w[39] = __byte_perm (w[28], w[29], selector); w[38] = __byte_perm (w[27], w[28], selector); w[37] = __byte_perm (w[26], w[27], selector); w[36] = __byte_perm (w[25], w[26], selector); w[35] = __byte_perm (w[24], w[25], selector); w[34] = __byte_perm (w[23], w[24], selector); w[33] = __byte_perm (w[22], w[23], selector); w[32] = __byte_perm (w[21], w[22], selector); w[31] = __byte_perm (w[20], w[21], selector); w[30] = __byte_perm (w[19], w[20], selector); w[29] = __byte_perm (w[18], w[19], selector); w[28] = __byte_perm (w[17], w[18], selector); w[27] = __byte_perm (w[16], w[17], selector); w[26] = __byte_perm (w[15], w[16], selector); w[25] = __byte_perm (w[14], w[15], selector); w[24] = __byte_perm (w[13], w[14], selector); w[23] = __byte_perm (w[12], w[13], selector); w[22] = __byte_perm (w[11], w[12], selector); w[21] = __byte_perm (w[10], w[11], selector); w[20] = __byte_perm (w[ 9], w[10], selector); w[19] = __byte_perm (w[ 8], w[ 9], selector); w[18] = 
__byte_perm (w[ 7], w[ 8], selector); w[17] = __byte_perm (w[ 6], w[ 7], selector); w[16] = __byte_perm (w[ 5], w[ 6], selector); w[15] = __byte_perm (w[ 4], w[ 5], selector); w[14] = __byte_perm (w[ 3], w[ 4], selector); w[13] = __byte_perm (w[ 2], w[ 3], selector); w[12] = __byte_perm (w[ 1], w[ 2], selector); w[11] = __byte_perm (w[ 0], w[ 1], selector); w[10] = __byte_perm ( 0, w[ 0], selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = __byte_perm (w[51], w[52], selector); w[62] = __byte_perm (w[50], w[51], selector); w[61] = __byte_perm (w[49], w[50], selector); w[60] = __byte_perm (w[48], w[49], selector); w[59] = __byte_perm (w[47], w[48], selector); w[58] = __byte_perm (w[46], w[47], selector); w[57] = __byte_perm (w[45], w[46], selector); w[56] = __byte_perm (w[44], w[45], selector); w[55] = __byte_perm (w[43], w[44], selector); w[54] = __byte_perm (w[42], w[43], selector); w[53] = __byte_perm (w[41], w[42], selector); w[52] = __byte_perm (w[40], w[41], selector); w[51] = __byte_perm (w[39], w[40], selector); w[50] = __byte_perm (w[38], w[39], selector); w[49] = __byte_perm (w[37], w[38], selector); w[48] = __byte_perm (w[36], w[37], selector); w[47] = __byte_perm (w[35], w[36], selector); w[46] = __byte_perm (w[34], w[35], selector); w[45] = __byte_perm (w[33], w[34], selector); w[44] = __byte_perm (w[32], w[33], selector); w[43] = __byte_perm (w[31], w[32], selector); w[42] = __byte_perm (w[30], w[31], selector); w[41] = __byte_perm (w[29], w[30], selector); w[40] = __byte_perm (w[28], w[29], selector); w[39] = __byte_perm (w[27], w[28], selector); w[38] = __byte_perm (w[26], w[27], selector); w[37] = __byte_perm (w[25], w[26], selector); w[36] = __byte_perm (w[24], w[25], selector); w[35] = __byte_perm (w[23], w[24], selector); w[34] = __byte_perm (w[22], w[23], selector); w[33] = __byte_perm (w[21], w[22], selector); w[32] = __byte_perm (w[20], w[21], 
selector); w[31] = __byte_perm (w[19], w[20], selector); w[30] = __byte_perm (w[18], w[19], selector); w[29] = __byte_perm (w[17], w[18], selector); w[28] = __byte_perm (w[16], w[17], selector); w[27] = __byte_perm (w[15], w[16], selector); w[26] = __byte_perm (w[14], w[15], selector); w[25] = __byte_perm (w[13], w[14], selector); w[24] = __byte_perm (w[12], w[13], selector); w[23] = __byte_perm (w[11], w[12], selector); w[22] = __byte_perm (w[10], w[11], selector); w[21] = __byte_perm (w[ 9], w[10], selector); w[20] = __byte_perm (w[ 8], w[ 9], selector); w[19] = __byte_perm (w[ 7], w[ 8], selector); w[18] = __byte_perm (w[ 6], w[ 7], selector); w[17] = __byte_perm (w[ 5], w[ 6], selector); w[16] = __byte_perm (w[ 4], w[ 5], selector); w[15] = __byte_perm (w[ 3], w[ 4], selector); w[14] = __byte_perm (w[ 2], w[ 3], selector); w[13] = __byte_perm (w[ 1], w[ 2], selector); w[12] = __byte_perm (w[ 0], w[ 1], selector); w[11] = __byte_perm ( 0, w[ 0], selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = __byte_perm (w[50], w[51], selector); w[62] = __byte_perm (w[49], w[50], selector); w[61] = __byte_perm (w[48], w[49], selector); w[60] = __byte_perm (w[47], w[48], selector); w[59] = __byte_perm (w[46], w[47], selector); w[58] = __byte_perm (w[45], w[46], selector); w[57] = __byte_perm (w[44], w[45], selector); w[56] = __byte_perm (w[43], w[44], selector); w[55] = __byte_perm (w[42], w[43], selector); w[54] = __byte_perm (w[41], w[42], selector); w[53] = __byte_perm (w[40], w[41], selector); w[52] = __byte_perm (w[39], w[40], selector); w[51] = __byte_perm (w[38], w[39], selector); w[50] = __byte_perm (w[37], w[38], selector); w[49] = __byte_perm (w[36], w[37], selector); w[48] = __byte_perm (w[35], w[36], selector); w[47] = __byte_perm (w[34], w[35], selector); w[46] = __byte_perm (w[33], w[34], selector); w[45] = __byte_perm (w[32], w[33], selector); w[44] = 
__byte_perm (w[31], w[32], selector); w[43] = __byte_perm (w[30], w[31], selector); w[42] = __byte_perm (w[29], w[30], selector); w[41] = __byte_perm (w[28], w[29], selector); w[40] = __byte_perm (w[27], w[28], selector); w[39] = __byte_perm (w[26], w[27], selector); w[38] = __byte_perm (w[25], w[26], selector); w[37] = __byte_perm (w[24], w[25], selector); w[36] = __byte_perm (w[23], w[24], selector); w[35] = __byte_perm (w[22], w[23], selector); w[34] = __byte_perm (w[21], w[22], selector); w[33] = __byte_perm (w[20], w[21], selector); w[32] = __byte_perm (w[19], w[20], selector); w[31] = __byte_perm (w[18], w[19], selector); w[30] = __byte_perm (w[17], w[18], selector); w[29] = __byte_perm (w[16], w[17], selector); w[28] = __byte_perm (w[15], w[16], selector); w[27] = __byte_perm (w[14], w[15], selector); w[26] = __byte_perm (w[13], w[14], selector); w[25] = __byte_perm (w[12], w[13], selector); w[24] = __byte_perm (w[11], w[12], selector); w[23] = __byte_perm (w[10], w[11], selector); w[22] = __byte_perm (w[ 9], w[10], selector); w[21] = __byte_perm (w[ 8], w[ 9], selector); w[20] = __byte_perm (w[ 7], w[ 8], selector); w[19] = __byte_perm (w[ 6], w[ 7], selector); w[18] = __byte_perm (w[ 5], w[ 6], selector); w[17] = __byte_perm (w[ 4], w[ 5], selector); w[16] = __byte_perm (w[ 3], w[ 4], selector); w[15] = __byte_perm (w[ 2], w[ 3], selector); w[14] = __byte_perm (w[ 1], w[ 2], selector); w[13] = __byte_perm (w[ 0], w[ 1], selector); w[12] = __byte_perm ( 0, w[ 0], selector); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = __byte_perm (w[49], w[50], selector); w[62] = __byte_perm (w[48], w[49], selector); w[61] = __byte_perm (w[47], w[48], selector); w[60] = __byte_perm (w[46], w[47], selector); w[59] = __byte_perm (w[45], w[46], selector); w[58] = __byte_perm (w[44], w[45], selector); w[57] = __byte_perm (w[43], w[44], selector); w[56] = __byte_perm 
(w[42], w[43], selector); w[55] = __byte_perm (w[41], w[42], selector); w[54] = __byte_perm (w[40], w[41], selector); w[53] = __byte_perm (w[39], w[40], selector); w[52] = __byte_perm (w[38], w[39], selector); w[51] = __byte_perm (w[37], w[38], selector); w[50] = __byte_perm (w[36], w[37], selector); w[49] = __byte_perm (w[35], w[36], selector); w[48] = __byte_perm (w[34], w[35], selector); w[47] = __byte_perm (w[33], w[34], selector); w[46] = __byte_perm (w[32], w[33], selector); w[45] = __byte_perm (w[31], w[32], selector); w[44] = __byte_perm (w[30], w[31], selector); w[43] = __byte_perm (w[29], w[30], selector); w[42] = __byte_perm (w[28], w[29], selector); w[41] = __byte_perm (w[27], w[28], selector); w[40] = __byte_perm (w[26], w[27], selector); w[39] = __byte_perm (w[25], w[26], selector); w[38] = __byte_perm (w[24], w[25], selector); w[37] = __byte_perm (w[23], w[24], selector); w[36] = __byte_perm (w[22], w[23], selector); w[35] = __byte_perm (w[21], w[22], selector); w[34] = __byte_perm (w[20], w[21], selector); w[33] = __byte_perm (w[19], w[20], selector); w[32] = __byte_perm (w[18], w[19], selector); w[31] = __byte_perm (w[17], w[18], selector); w[30] = __byte_perm (w[16], w[17], selector); w[29] = __byte_perm (w[15], w[16], selector); w[28] = __byte_perm (w[14], w[15], selector); w[27] = __byte_perm (w[13], w[14], selector); w[26] = __byte_perm (w[12], w[13], selector); w[25] = __byte_perm (w[11], w[12], selector); w[24] = __byte_perm (w[10], w[11], selector); w[23] = __byte_perm (w[ 9], w[10], selector); w[22] = __byte_perm (w[ 8], w[ 9], selector); w[21] = __byte_perm (w[ 7], w[ 8], selector); w[20] = __byte_perm (w[ 6], w[ 7], selector); w[19] = __byte_perm (w[ 5], w[ 6], selector); w[18] = __byte_perm (w[ 4], w[ 5], selector); w[17] = __byte_perm (w[ 3], w[ 4], selector); w[16] = __byte_perm (w[ 2], w[ 3], selector); w[15] = __byte_perm (w[ 1], w[ 2], selector); w[14] = __byte_perm (w[ 0], w[ 1], selector); w[13] = __byte_perm ( 0, w[ 0], 
selector); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 14: w[63] = __byte_perm (w[48], w[49], selector); w[62] = __byte_perm (w[47], w[48], selector); w[61] = __byte_perm (w[46], w[47], selector); w[60] = __byte_perm (w[45], w[46], selector); w[59] = __byte_perm (w[44], w[45], selector); w[58] = __byte_perm (w[43], w[44], selector); w[57] = __byte_perm (w[42], w[43], selector); w[56] = __byte_perm (w[41], w[42], selector); w[55] = __byte_perm (w[40], w[41], selector); w[54] = __byte_perm (w[39], w[40], selector); w[53] = __byte_perm (w[38], w[39], selector); w[52] = __byte_perm (w[37], w[38], selector); w[51] = __byte_perm (w[36], w[37], selector); w[50] = __byte_perm (w[35], w[36], selector); w[49] = __byte_perm (w[34], w[35], selector); w[48] = __byte_perm (w[33], w[34], selector); w[47] = __byte_perm (w[32], w[33], selector); w[46] = __byte_perm (w[31], w[32], selector); w[45] = __byte_perm (w[30], w[31], selector); w[44] = __byte_perm (w[29], w[30], selector); w[43] = __byte_perm (w[28], w[29], selector); w[42] = __byte_perm (w[27], w[28], selector); w[41] = __byte_perm (w[26], w[27], selector); w[40] = __byte_perm (w[25], w[26], selector); w[39] = __byte_perm (w[24], w[25], selector); w[38] = __byte_perm (w[23], w[24], selector); w[37] = __byte_perm (w[22], w[23], selector); w[36] = __byte_perm (w[21], w[22], selector); w[35] = __byte_perm (w[20], w[21], selector); w[34] = __byte_perm (w[19], w[20], selector); w[33] = __byte_perm (w[18], w[19], selector); w[32] = __byte_perm (w[17], w[18], selector); w[31] = __byte_perm (w[16], w[17], selector); w[30] = __byte_perm (w[15], w[16], selector); w[29] = __byte_perm (w[14], w[15], selector); w[28] = __byte_perm (w[13], w[14], selector); w[27] = __byte_perm (w[12], w[13], selector); w[26] = __byte_perm (w[11], w[12], selector); w[25] = __byte_perm (w[10], w[11], selector); w[24] = __byte_perm (w[ 9], w[10], 
selector); w[23] = __byte_perm (w[ 8], w[ 9], selector); w[22] = __byte_perm (w[ 7], w[ 8], selector); w[21] = __byte_perm (w[ 6], w[ 7], selector); w[20] = __byte_perm (w[ 5], w[ 6], selector); w[19] = __byte_perm (w[ 4], w[ 5], selector); w[18] = __byte_perm (w[ 3], w[ 4], selector); w[17] = __byte_perm (w[ 2], w[ 3], selector); w[16] = __byte_perm (w[ 1], w[ 2], selector); w[15] = __byte_perm (w[ 0], w[ 1], selector); w[14] = __byte_perm ( 0, w[ 0], selector); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = __byte_perm (w[47], w[48], selector); w[62] = __byte_perm (w[46], w[47], selector); w[61] = __byte_perm (w[45], w[46], selector); w[60] = __byte_perm (w[44], w[45], selector); w[59] = __byte_perm (w[43], w[44], selector); w[58] = __byte_perm (w[42], w[43], selector); w[57] = __byte_perm (w[41], w[42], selector); w[56] = __byte_perm (w[40], w[41], selector); w[55] = __byte_perm (w[39], w[40], selector); w[54] = __byte_perm (w[38], w[39], selector); w[53] = __byte_perm (w[37], w[38], selector); w[52] = __byte_perm (w[36], w[37], selector); w[51] = __byte_perm (w[35], w[36], selector); w[50] = __byte_perm (w[34], w[35], selector); w[49] = __byte_perm (w[33], w[34], selector); w[48] = __byte_perm (w[32], w[33], selector); w[47] = __byte_perm (w[31], w[32], selector); w[46] = __byte_perm (w[30], w[31], selector); w[45] = __byte_perm (w[29], w[30], selector); w[44] = __byte_perm (w[28], w[29], selector); w[43] = __byte_perm (w[27], w[28], selector); w[42] = __byte_perm (w[26], w[27], selector); w[41] = __byte_perm (w[25], w[26], selector); w[40] = __byte_perm (w[24], w[25], selector); w[39] = __byte_perm (w[23], w[24], selector); w[38] = __byte_perm (w[22], w[23], selector); w[37] = __byte_perm (w[21], w[22], selector); w[36] = __byte_perm (w[20], w[21], selector); w[35] = __byte_perm (w[19], w[20], selector); w[34] = __byte_perm (w[18], 
w[19], selector); w[33] = __byte_perm (w[17], w[18], selector); w[32] = __byte_perm (w[16], w[17], selector); w[31] = __byte_perm (w[15], w[16], selector); w[30] = __byte_perm (w[14], w[15], selector); w[29] = __byte_perm (w[13], w[14], selector); w[28] = __byte_perm (w[12], w[13], selector); w[27] = __byte_perm (w[11], w[12], selector); w[26] = __byte_perm (w[10], w[11], selector); w[25] = __byte_perm (w[ 9], w[10], selector); w[24] = __byte_perm (w[ 8], w[ 9], selector); w[23] = __byte_perm (w[ 7], w[ 8], selector); w[22] = __byte_perm (w[ 6], w[ 7], selector); w[21] = __byte_perm (w[ 5], w[ 6], selector); w[20] = __byte_perm (w[ 4], w[ 5], selector); w[19] = __byte_perm (w[ 3], w[ 4], selector); w[18] = __byte_perm (w[ 2], w[ 3], selector); w[17] = __byte_perm (w[ 1], w[ 2], selector); w[16] = __byte_perm (w[ 0], w[ 1], selector); w[15] = __byte_perm ( 0, w[ 0], selector); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = __byte_perm (w[46], w[47], selector); w[62] = __byte_perm (w[45], w[46], selector); w[61] = __byte_perm (w[44], w[45], selector); w[60] = __byte_perm (w[43], w[44], selector); w[59] = __byte_perm (w[42], w[43], selector); w[58] = __byte_perm (w[41], w[42], selector); w[57] = __byte_perm (w[40], w[41], selector); w[56] = __byte_perm (w[39], w[40], selector); w[55] = __byte_perm (w[38], w[39], selector); w[54] = __byte_perm (w[37], w[38], selector); w[53] = __byte_perm (w[36], w[37], selector); w[52] = __byte_perm (w[35], w[36], selector); w[51] = __byte_perm (w[34], w[35], selector); w[50] = __byte_perm (w[33], w[34], selector); w[49] = __byte_perm (w[32], w[33], selector); w[48] = __byte_perm (w[31], w[32], selector); w[47] = __byte_perm (w[30], w[31], selector); w[46] = __byte_perm (w[29], w[30], selector); w[45] = __byte_perm (w[28], w[29], selector); w[44] = __byte_perm (w[27], w[28], selector); w[43] = 
__byte_perm (w[26], w[27], selector); w[42] = __byte_perm (w[25], w[26], selector); w[41] = __byte_perm (w[24], w[25], selector); w[40] = __byte_perm (w[23], w[24], selector); w[39] = __byte_perm (w[22], w[23], selector); w[38] = __byte_perm (w[21], w[22], selector); w[37] = __byte_perm (w[20], w[21], selector); w[36] = __byte_perm (w[19], w[20], selector); w[35] = __byte_perm (w[18], w[19], selector); w[34] = __byte_perm (w[17], w[18], selector); w[33] = __byte_perm (w[16], w[17], selector); w[32] = __byte_perm (w[15], w[16], selector); w[31] = __byte_perm (w[14], w[15], selector); w[30] = __byte_perm (w[13], w[14], selector); w[29] = __byte_perm (w[12], w[13], selector); w[28] = __byte_perm (w[11], w[12], selector); w[27] = __byte_perm (w[10], w[11], selector); w[26] = __byte_perm (w[ 9], w[10], selector); w[25] = __byte_perm (w[ 8], w[ 9], selector); w[24] = __byte_perm (w[ 7], w[ 8], selector); w[23] = __byte_perm (w[ 6], w[ 7], selector); w[22] = __byte_perm (w[ 5], w[ 6], selector); w[21] = __byte_perm (w[ 4], w[ 5], selector); w[20] = __byte_perm (w[ 3], w[ 4], selector); w[19] = __byte_perm (w[ 2], w[ 3], selector); w[18] = __byte_perm (w[ 1], w[ 2], selector); w[17] = __byte_perm (w[ 0], w[ 1], selector); w[16] = __byte_perm ( 0, w[ 0], selector); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = __byte_perm (w[45], w[46], selector); w[62] = __byte_perm (w[44], w[45], selector); w[61] = __byte_perm (w[43], w[44], selector); w[60] = __byte_perm (w[42], w[43], selector); w[59] = __byte_perm (w[41], w[42], selector); w[58] = __byte_perm (w[40], w[41], selector); w[57] = __byte_perm (w[39], w[40], selector); w[56] = __byte_perm (w[38], w[39], selector); w[55] = __byte_perm (w[37], w[38], selector); w[54] = __byte_perm (w[36], w[37], selector); w[53] = __byte_perm (w[35], w[36], selector); w[52] = __byte_perm 
(w[34], w[35], selector); w[51] = __byte_perm (w[33], w[34], selector); w[50] = __byte_perm (w[32], w[33], selector); w[49] = __byte_perm (w[31], w[32], selector); w[48] = __byte_perm (w[30], w[31], selector); w[47] = __byte_perm (w[29], w[30], selector); w[46] = __byte_perm (w[28], w[29], selector); w[45] = __byte_perm (w[27], w[28], selector); w[44] = __byte_perm (w[26], w[27], selector); w[43] = __byte_perm (w[25], w[26], selector); w[42] = __byte_perm (w[24], w[25], selector); w[41] = __byte_perm (w[23], w[24], selector); w[40] = __byte_perm (w[22], w[23], selector); w[39] = __byte_perm (w[21], w[22], selector); w[38] = __byte_perm (w[20], w[21], selector); w[37] = __byte_perm (w[19], w[20], selector); w[36] = __byte_perm (w[18], w[19], selector); w[35] = __byte_perm (w[17], w[18], selector); w[34] = __byte_perm (w[16], w[17], selector); w[33] = __byte_perm (w[15], w[16], selector); w[32] = __byte_perm (w[14], w[15], selector); w[31] = __byte_perm (w[13], w[14], selector); w[30] = __byte_perm (w[12], w[13], selector); w[29] = __byte_perm (w[11], w[12], selector); w[28] = __byte_perm (w[10], w[11], selector); w[27] = __byte_perm (w[ 9], w[10], selector); w[26] = __byte_perm (w[ 8], w[ 9], selector); w[25] = __byte_perm (w[ 7], w[ 8], selector); w[24] = __byte_perm (w[ 6], w[ 7], selector); w[23] = __byte_perm (w[ 5], w[ 6], selector); w[22] = __byte_perm (w[ 4], w[ 5], selector); w[21] = __byte_perm (w[ 3], w[ 4], selector); w[20] = __byte_perm (w[ 2], w[ 3], selector); w[19] = __byte_perm (w[ 1], w[ 2], selector); w[18] = __byte_perm (w[ 0], w[ 1], selector); w[17] = __byte_perm ( 0, w[ 0], selector); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = __byte_perm (w[44], w[45], selector); w[62] = __byte_perm (w[43], w[44], selector); w[61] = __byte_perm (w[42], w[43], selector); w[60] = __byte_perm 
(w[41], w[42], selector); w[59] = __byte_perm (w[40], w[41], selector); w[58] = __byte_perm (w[39], w[40], selector); w[57] = __byte_perm (w[38], w[39], selector); w[56] = __byte_perm (w[37], w[38], selector); w[55] = __byte_perm (w[36], w[37], selector); w[54] = __byte_perm (w[35], w[36], selector); w[53] = __byte_perm (w[34], w[35], selector); w[52] = __byte_perm (w[33], w[34], selector); w[51] = __byte_perm (w[32], w[33], selector); w[50] = __byte_perm (w[31], w[32], selector); w[49] = __byte_perm (w[30], w[31], selector); w[48] = __byte_perm (w[29], w[30], selector); w[47] = __byte_perm (w[28], w[29], selector); w[46] = __byte_perm (w[27], w[28], selector); w[45] = __byte_perm (w[26], w[27], selector); w[44] = __byte_perm (w[25], w[26], selector); w[43] = __byte_perm (w[24], w[25], selector); w[42] = __byte_perm (w[23], w[24], selector); w[41] = __byte_perm (w[22], w[23], selector); w[40] = __byte_perm (w[21], w[22], selector); w[39] = __byte_perm (w[20], w[21], selector); w[38] = __byte_perm (w[19], w[20], selector); w[37] = __byte_perm (w[18], w[19], selector); w[36] = __byte_perm (w[17], w[18], selector); w[35] = __byte_perm (w[16], w[17], selector); w[34] = __byte_perm (w[15], w[16], selector); w[33] = __byte_perm (w[14], w[15], selector); w[32] = __byte_perm (w[13], w[14], selector); w[31] = __byte_perm (w[12], w[13], selector); w[30] = __byte_perm (w[11], w[12], selector); w[29] = __byte_perm (w[10], w[11], selector); w[28] = __byte_perm (w[ 9], w[10], selector); w[27] = __byte_perm (w[ 8], w[ 9], selector); w[26] = __byte_perm (w[ 7], w[ 8], selector); w[25] = __byte_perm (w[ 6], w[ 7], selector); w[24] = __byte_perm (w[ 5], w[ 6], selector); w[23] = __byte_perm (w[ 4], w[ 5], selector); w[22] = __byte_perm (w[ 3], w[ 4], selector); w[21] = __byte_perm (w[ 2], w[ 3], selector); w[20] = __byte_perm (w[ 1], w[ 2], selector); w[19] = __byte_perm (w[ 0], w[ 1], selector); w[18] = __byte_perm ( 0, w[ 0], selector); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; 
w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = __byte_perm (w[43], w[44], selector); w[62] = __byte_perm (w[42], w[43], selector); w[61] = __byte_perm (w[41], w[42], selector); w[60] = __byte_perm (w[40], w[41], selector); w[59] = __byte_perm (w[39], w[40], selector); w[58] = __byte_perm (w[38], w[39], selector); w[57] = __byte_perm (w[37], w[38], selector); w[56] = __byte_perm (w[36], w[37], selector); w[55] = __byte_perm (w[35], w[36], selector); w[54] = __byte_perm (w[34], w[35], selector); w[53] = __byte_perm (w[33], w[34], selector); w[52] = __byte_perm (w[32], w[33], selector); w[51] = __byte_perm (w[31], w[32], selector); w[50] = __byte_perm (w[30], w[31], selector); w[49] = __byte_perm (w[29], w[30], selector); w[48] = __byte_perm (w[28], w[29], selector); w[47] = __byte_perm (w[27], w[28], selector); w[46] = __byte_perm (w[26], w[27], selector); w[45] = __byte_perm (w[25], w[26], selector); w[44] = __byte_perm (w[24], w[25], selector); w[43] = __byte_perm (w[23], w[24], selector); w[42] = __byte_perm (w[22], w[23], selector); w[41] = __byte_perm (w[21], w[22], selector); w[40] = __byte_perm (w[20], w[21], selector); w[39] = __byte_perm (w[19], w[20], selector); w[38] = __byte_perm (w[18], w[19], selector); w[37] = __byte_perm (w[17], w[18], selector); w[36] = __byte_perm (w[16], w[17], selector); w[35] = __byte_perm (w[15], w[16], selector); w[34] = __byte_perm (w[14], w[15], selector); w[33] = __byte_perm (w[13], w[14], selector); w[32] = __byte_perm (w[12], w[13], selector); w[31] = __byte_perm (w[11], w[12], selector); w[30] = __byte_perm (w[10], w[11], selector); w[29] = __byte_perm (w[ 9], w[10], selector); w[28] = __byte_perm (w[ 8], w[ 9], selector); w[27] = __byte_perm (w[ 7], w[ 8], selector); w[26] = __byte_perm (w[ 6], w[ 7], selector); w[25] = __byte_perm (w[ 5], w[ 6], selector); w[24] = __byte_perm (w[ 4], w[ 5], 
selector); w[23] = __byte_perm (w[ 3], w[ 4], selector); w[22] = __byte_perm (w[ 2], w[ 3], selector); w[21] = __byte_perm (w[ 1], w[ 2], selector); w[20] = __byte_perm (w[ 0], w[ 1], selector); w[19] = __byte_perm ( 0, w[ 0], selector); w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = __byte_perm (w[42], w[43], selector); w[62] = __byte_perm (w[41], w[42], selector); w[61] = __byte_perm (w[40], w[41], selector); w[60] = __byte_perm (w[39], w[40], selector); w[59] = __byte_perm (w[38], w[39], selector); w[58] = __byte_perm (w[37], w[38], selector); w[57] = __byte_perm (w[36], w[37], selector); w[56] = __byte_perm (w[35], w[36], selector); w[55] = __byte_perm (w[34], w[35], selector); w[54] = __byte_perm (w[33], w[34], selector); w[53] = __byte_perm (w[32], w[33], selector); w[52] = __byte_perm (w[31], w[32], selector); w[51] = __byte_perm (w[30], w[31], selector); w[50] = __byte_perm (w[29], w[30], selector); w[49] = __byte_perm (w[28], w[29], selector); w[48] = __byte_perm (w[27], w[28], selector); w[47] = __byte_perm (w[26], w[27], selector); w[46] = __byte_perm (w[25], w[26], selector); w[45] = __byte_perm (w[24], w[25], selector); w[44] = __byte_perm (w[23], w[24], selector); w[43] = __byte_perm (w[22], w[23], selector); w[42] = __byte_perm (w[21], w[22], selector); w[41] = __byte_perm (w[20], w[21], selector); w[40] = __byte_perm (w[19], w[20], selector); w[39] = __byte_perm (w[18], w[19], selector); w[38] = __byte_perm (w[17], w[18], selector); w[37] = __byte_perm (w[16], w[17], selector); w[36] = __byte_perm (w[15], w[16], selector); w[35] = __byte_perm (w[14], w[15], selector); w[34] = __byte_perm (w[13], w[14], selector); w[33] = __byte_perm (w[12], w[13], selector); w[32] = __byte_perm (w[11], w[12], selector); w[31] = __byte_perm (w[10], w[11], selector); w[30] = 
__byte_perm (w[ 9], w[10], selector); w[29] = __byte_perm (w[ 8], w[ 9], selector); w[28] = __byte_perm (w[ 7], w[ 8], selector); w[27] = __byte_perm (w[ 6], w[ 7], selector); w[26] = __byte_perm (w[ 5], w[ 6], selector); w[25] = __byte_perm (w[ 4], w[ 5], selector); w[24] = __byte_perm (w[ 3], w[ 4], selector); w[23] = __byte_perm (w[ 2], w[ 3], selector); w[22] = __byte_perm (w[ 1], w[ 2], selector); w[21] = __byte_perm (w[ 0], w[ 1], selector); w[20] = __byte_perm ( 0, w[ 0], selector); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = __byte_perm (w[41], w[42], selector); w[62] = __byte_perm (w[40], w[41], selector); w[61] = __byte_perm (w[39], w[40], selector); w[60] = __byte_perm (w[38], w[39], selector); w[59] = __byte_perm (w[37], w[38], selector); w[58] = __byte_perm (w[36], w[37], selector); w[57] = __byte_perm (w[35], w[36], selector); w[56] = __byte_perm (w[34], w[35], selector); w[55] = __byte_perm (w[33], w[34], selector); w[54] = __byte_perm (w[32], w[33], selector); w[53] = __byte_perm (w[31], w[32], selector); w[52] = __byte_perm (w[30], w[31], selector); w[51] = __byte_perm (w[29], w[30], selector); w[50] = __byte_perm (w[28], w[29], selector); w[49] = __byte_perm (w[27], w[28], selector); w[48] = __byte_perm (w[26], w[27], selector); w[47] = __byte_perm (w[25], w[26], selector); w[46] = __byte_perm (w[24], w[25], selector); w[45] = __byte_perm (w[23], w[24], selector); w[44] = __byte_perm (w[22], w[23], selector); w[43] = __byte_perm (w[21], w[22], selector); w[42] = __byte_perm (w[20], w[21], selector); w[41] = __byte_perm (w[19], w[20], selector); w[40] = __byte_perm (w[18], w[19], selector); w[39] = __byte_perm (w[17], w[18], selector); w[38] = __byte_perm (w[16], w[17], selector); w[37] = __byte_perm (w[15], w[16], selector); w[36] = __byte_perm 
(w[14], w[15], selector); w[35] = __byte_perm (w[13], w[14], selector); w[34] = __byte_perm (w[12], w[13], selector); w[33] = __byte_perm (w[11], w[12], selector); w[32] = __byte_perm (w[10], w[11], selector); w[31] = __byte_perm (w[ 9], w[10], selector); w[30] = __byte_perm (w[ 8], w[ 9], selector); w[29] = __byte_perm (w[ 7], w[ 8], selector); w[28] = __byte_perm (w[ 6], w[ 7], selector); w[27] = __byte_perm (w[ 5], w[ 6], selector); w[26] = __byte_perm (w[ 4], w[ 5], selector); w[25] = __byte_perm (w[ 3], w[ 4], selector); w[24] = __byte_perm (w[ 2], w[ 3], selector); w[23] = __byte_perm (w[ 1], w[ 2], selector); w[22] = __byte_perm (w[ 0], w[ 1], selector); w[21] = __byte_perm ( 0, w[ 0], selector); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = __byte_perm (w[40], w[41], selector); w[62] = __byte_perm (w[39], w[40], selector); w[61] = __byte_perm (w[38], w[39], selector); w[60] = __byte_perm (w[37], w[38], selector); w[59] = __byte_perm (w[36], w[37], selector); w[58] = __byte_perm (w[35], w[36], selector); w[57] = __byte_perm (w[34], w[35], selector); w[56] = __byte_perm (w[33], w[34], selector); w[55] = __byte_perm (w[32], w[33], selector); w[54] = __byte_perm (w[31], w[32], selector); w[53] = __byte_perm (w[30], w[31], selector); w[52] = __byte_perm (w[29], w[30], selector); w[51] = __byte_perm (w[28], w[29], selector); w[50] = __byte_perm (w[27], w[28], selector); w[49] = __byte_perm (w[26], w[27], selector); w[48] = __byte_perm (w[25], w[26], selector); w[47] = __byte_perm (w[24], w[25], selector); w[46] = __byte_perm (w[23], w[24], selector); w[45] = __byte_perm (w[22], w[23], selector); w[44] = __byte_perm (w[21], w[22], selector); w[43] = __byte_perm (w[20], w[21], selector); w[42] = __byte_perm (w[19], w[20], selector); w[41] = __byte_perm (w[18], 
w[19], selector); w[40] = __byte_perm (w[17], w[18], selector); w[39] = __byte_perm (w[16], w[17], selector); w[38] = __byte_perm (w[15], w[16], selector); w[37] = __byte_perm (w[14], w[15], selector); w[36] = __byte_perm (w[13], w[14], selector); w[35] = __byte_perm (w[12], w[13], selector); w[34] = __byte_perm (w[11], w[12], selector); w[33] = __byte_perm (w[10], w[11], selector); w[32] = __byte_perm (w[ 9], w[10], selector); w[31] = __byte_perm (w[ 8], w[ 9], selector); w[30] = __byte_perm (w[ 7], w[ 8], selector); w[29] = __byte_perm (w[ 6], w[ 7], selector); w[28] = __byte_perm (w[ 5], w[ 6], selector); w[27] = __byte_perm (w[ 4], w[ 5], selector); w[26] = __byte_perm (w[ 3], w[ 4], selector); w[25] = __byte_perm (w[ 2], w[ 3], selector); w[24] = __byte_perm (w[ 1], w[ 2], selector); w[23] = __byte_perm (w[ 0], w[ 1], selector); w[22] = __byte_perm ( 0, w[ 0], selector); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = __byte_perm (w[39], w[40], selector); w[62] = __byte_perm (w[38], w[39], selector); w[61] = __byte_perm (w[37], w[38], selector); w[60] = __byte_perm (w[36], w[37], selector); w[59] = __byte_perm (w[35], w[36], selector); w[58] = __byte_perm (w[34], w[35], selector); w[57] = __byte_perm (w[33], w[34], selector); w[56] = __byte_perm (w[32], w[33], selector); w[55] = __byte_perm (w[31], w[32], selector); w[54] = __byte_perm (w[30], w[31], selector); w[53] = __byte_perm (w[29], w[30], selector); w[52] = __byte_perm (w[28], w[29], selector); w[51] = __byte_perm (w[27], w[28], selector); w[50] = __byte_perm (w[26], w[27], selector); w[49] = __byte_perm (w[25], w[26], selector); w[48] = __byte_perm (w[24], w[25], selector); w[47] = __byte_perm (w[23], w[24], selector); w[46] = __byte_perm (w[22], w[23], selector); w[45] = __byte_perm 
(w[21], w[22], selector); w[44] = __byte_perm (w[20], w[21], selector); w[43] = __byte_perm (w[19], w[20], selector); w[42] = __byte_perm (w[18], w[19], selector); w[41] = __byte_perm (w[17], w[18], selector); w[40] = __byte_perm (w[16], w[17], selector); w[39] = __byte_perm (w[15], w[16], selector); w[38] = __byte_perm (w[14], w[15], selector); w[37] = __byte_perm (w[13], w[14], selector); w[36] = __byte_perm (w[12], w[13], selector); w[35] = __byte_perm (w[11], w[12], selector); w[34] = __byte_perm (w[10], w[11], selector); w[33] = __byte_perm (w[ 9], w[10], selector); w[32] = __byte_perm (w[ 8], w[ 9], selector); w[31] = __byte_perm (w[ 7], w[ 8], selector); w[30] = __byte_perm (w[ 6], w[ 7], selector); w[29] = __byte_perm (w[ 5], w[ 6], selector); w[28] = __byte_perm (w[ 4], w[ 5], selector); w[27] = __byte_perm (w[ 3], w[ 4], selector); w[26] = __byte_perm (w[ 2], w[ 3], selector); w[25] = __byte_perm (w[ 1], w[ 2], selector); w[24] = __byte_perm (w[ 0], w[ 1], selector); w[23] = __byte_perm ( 0, w[ 0], selector); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = __byte_perm (w[38], w[39], selector); w[62] = __byte_perm (w[37], w[38], selector); w[61] = __byte_perm (w[36], w[37], selector); w[60] = __byte_perm (w[35], w[36], selector); w[59] = __byte_perm (w[34], w[35], selector); w[58] = __byte_perm (w[33], w[34], selector); w[57] = __byte_perm (w[32], w[33], selector); w[56] = __byte_perm (w[31], w[32], selector); w[55] = __byte_perm (w[30], w[31], selector); w[54] = __byte_perm (w[29], w[30], selector); w[53] = __byte_perm (w[28], w[29], selector); w[52] = __byte_perm (w[27], w[28], selector); w[51] = __byte_perm (w[26], w[27], selector); w[50] = __byte_perm (w[25], w[26], selector); w[49] = __byte_perm (w[24], w[25], selector); w[48] 
= __byte_perm (w[23], w[24], selector); w[47] = __byte_perm (w[22], w[23], selector); w[46] = __byte_perm (w[21], w[22], selector); w[45] = __byte_perm (w[20], w[21], selector); w[44] = __byte_perm (w[19], w[20], selector); w[43] = __byte_perm (w[18], w[19], selector); w[42] = __byte_perm (w[17], w[18], selector); w[41] = __byte_perm (w[16], w[17], selector); w[40] = __byte_perm (w[15], w[16], selector); w[39] = __byte_perm (w[14], w[15], selector); w[38] = __byte_perm (w[13], w[14], selector); w[37] = __byte_perm (w[12], w[13], selector); w[36] = __byte_perm (w[11], w[12], selector); w[35] = __byte_perm (w[10], w[11], selector); w[34] = __byte_perm (w[ 9], w[10], selector); w[33] = __byte_perm (w[ 8], w[ 9], selector); w[32] = __byte_perm (w[ 7], w[ 8], selector); w[31] = __byte_perm (w[ 6], w[ 7], selector); w[30] = __byte_perm (w[ 5], w[ 6], selector); w[29] = __byte_perm (w[ 4], w[ 5], selector); w[28] = __byte_perm (w[ 3], w[ 4], selector); w[27] = __byte_perm (w[ 2], w[ 3], selector); w[26] = __byte_perm (w[ 1], w[ 2], selector); w[25] = __byte_perm (w[ 0], w[ 1], selector); w[24] = __byte_perm ( 0, w[ 0], selector); w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = __byte_perm (w[37], w[38], selector); w[62] = __byte_perm (w[36], w[37], selector); w[61] = __byte_perm (w[35], w[36], selector); w[60] = __byte_perm (w[34], w[35], selector); w[59] = __byte_perm (w[33], w[34], selector); w[58] = __byte_perm (w[32], w[33], selector); w[57] = __byte_perm (w[31], w[32], selector); w[56] = __byte_perm (w[30], w[31], selector); w[55] = __byte_perm (w[29], w[30], selector); w[54] = __byte_perm (w[28], w[29], selector); w[53] = __byte_perm (w[27], w[28], selector); w[52] = __byte_perm (w[26], w[27], selector); w[51] = __byte_perm 
(w[25], w[26], selector); w[50] = __byte_perm (w[24], w[25], selector); w[49] = __byte_perm (w[23], w[24], selector); w[48] = __byte_perm (w[22], w[23], selector); w[47] = __byte_perm (w[21], w[22], selector); w[46] = __byte_perm (w[20], w[21], selector); w[45] = __byte_perm (w[19], w[20], selector); w[44] = __byte_perm (w[18], w[19], selector); w[43] = __byte_perm (w[17], w[18], selector); w[42] = __byte_perm (w[16], w[17], selector); w[41] = __byte_perm (w[15], w[16], selector); w[40] = __byte_perm (w[14], w[15], selector); w[39] = __byte_perm (w[13], w[14], selector); w[38] = __byte_perm (w[12], w[13], selector); w[37] = __byte_perm (w[11], w[12], selector); w[36] = __byte_perm (w[10], w[11], selector); w[35] = __byte_perm (w[ 9], w[10], selector); w[34] = __byte_perm (w[ 8], w[ 9], selector); w[33] = __byte_perm (w[ 7], w[ 8], selector); w[32] = __byte_perm (w[ 6], w[ 7], selector); w[31] = __byte_perm (w[ 5], w[ 6], selector); w[30] = __byte_perm (w[ 4], w[ 5], selector); w[29] = __byte_perm (w[ 3], w[ 4], selector); w[28] = __byte_perm (w[ 2], w[ 3], selector); w[27] = __byte_perm (w[ 1], w[ 2], selector); w[26] = __byte_perm (w[ 0], w[ 1], selector); w[25] = __byte_perm ( 0, w[ 0], selector); w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = __byte_perm (w[36], w[37], selector); w[62] = __byte_perm (w[35], w[36], selector); w[61] = __byte_perm (w[34], w[35], selector); w[60] = __byte_perm (w[33], w[34], selector); w[59] = __byte_perm (w[32], w[33], selector); w[58] = __byte_perm (w[31], w[32], selector); w[57] = __byte_perm (w[30], w[31], selector); w[56] = __byte_perm (w[29], w[30], selector); w[55] = __byte_perm (w[28], w[29], selector); w[54] = __byte_perm (w[27], w[28], selector); w[53] = __byte_perm (w[26], 
w[27], selector); w[52] = __byte_perm (w[25], w[26], selector); w[51] = __byte_perm (w[24], w[25], selector); w[50] = __byte_perm (w[23], w[24], selector); w[49] = __byte_perm (w[22], w[23], selector); w[48] = __byte_perm (w[21], w[22], selector); w[47] = __byte_perm (w[20], w[21], selector); w[46] = __byte_perm (w[19], w[20], selector); w[45] = __byte_perm (w[18], w[19], selector); w[44] = __byte_perm (w[17], w[18], selector); w[43] = __byte_perm (w[16], w[17], selector); w[42] = __byte_perm (w[15], w[16], selector); w[41] = __byte_perm (w[14], w[15], selector); w[40] = __byte_perm (w[13], w[14], selector); w[39] = __byte_perm (w[12], w[13], selector); w[38] = __byte_perm (w[11], w[12], selector); w[37] = __byte_perm (w[10], w[11], selector); w[36] = __byte_perm (w[ 9], w[10], selector); w[35] = __byte_perm (w[ 8], w[ 9], selector); w[34] = __byte_perm (w[ 7], w[ 8], selector); w[33] = __byte_perm (w[ 6], w[ 7], selector); w[32] = __byte_perm (w[ 5], w[ 6], selector); w[31] = __byte_perm (w[ 4], w[ 5], selector); w[30] = __byte_perm (w[ 3], w[ 4], selector); w[29] = __byte_perm (w[ 2], w[ 3], selector); w[28] = __byte_perm (w[ 1], w[ 2], selector); w[27] = __byte_perm (w[ 0], w[ 1], selector); w[26] = __byte_perm ( 0, w[ 0], selector); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = __byte_perm (w[35], w[36], selector); w[62] = __byte_perm (w[34], w[35], selector); w[61] = __byte_perm (w[33], w[34], selector); w[60] = __byte_perm (w[32], w[33], selector); w[59] = __byte_perm (w[31], w[32], selector); w[58] = __byte_perm (w[30], w[31], selector); w[57] = __byte_perm (w[29], w[30], selector); w[56] = __byte_perm (w[28], w[29], selector); w[55] = __byte_perm (w[27], w[28], selector); w[54] = __byte_perm 
(w[26], w[27], selector); w[53] = __byte_perm (w[25], w[26], selector); w[52] = __byte_perm (w[24], w[25], selector); w[51] = __byte_perm (w[23], w[24], selector); w[50] = __byte_perm (w[22], w[23], selector); w[49] = __byte_perm (w[21], w[22], selector); w[48] = __byte_perm (w[20], w[21], selector); w[47] = __byte_perm (w[19], w[20], selector); w[46] = __byte_perm (w[18], w[19], selector); w[45] = __byte_perm (w[17], w[18], selector); w[44] = __byte_perm (w[16], w[17], selector); w[43] = __byte_perm (w[15], w[16], selector); w[42] = __byte_perm (w[14], w[15], selector); w[41] = __byte_perm (w[13], w[14], selector); w[40] = __byte_perm (w[12], w[13], selector); w[39] = __byte_perm (w[11], w[12], selector); w[38] = __byte_perm (w[10], w[11], selector); w[37] = __byte_perm (w[ 9], w[10], selector); w[36] = __byte_perm (w[ 8], w[ 9], selector); w[35] = __byte_perm (w[ 7], w[ 8], selector); w[34] = __byte_perm (w[ 6], w[ 7], selector); w[33] = __byte_perm (w[ 5], w[ 6], selector); w[32] = __byte_perm (w[ 4], w[ 5], selector); w[31] = __byte_perm (w[ 3], w[ 4], selector); w[30] = __byte_perm (w[ 2], w[ 3], selector); w[29] = __byte_perm (w[ 1], w[ 2], selector); w[28] = __byte_perm (w[ 0], w[ 1], selector); w[27] = __byte_perm ( 0, w[ 0], selector); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = __byte_perm (w[34], w[35], selector); w[62] = __byte_perm (w[33], w[34], selector); w[61] = __byte_perm (w[32], w[33], selector); w[60] = __byte_perm (w[31], w[32], selector); w[59] = __byte_perm (w[30], w[31], selector); w[58] = __byte_perm (w[29], w[30], selector); w[57] = __byte_perm (w[28], w[29], selector); w[56] = __byte_perm (w[27], w[28], selector); w[55] = __byte_perm (w[26], w[27], selector); w[54] = 
__byte_perm (w[25], w[26], selector); w[53] = __byte_perm (w[24], w[25], selector); w[52] = __byte_perm (w[23], w[24], selector); w[51] = __byte_perm (w[22], w[23], selector); w[50] = __byte_perm (w[21], w[22], selector); w[49] = __byte_perm (w[20], w[21], selector); w[48] = __byte_perm (w[19], w[20], selector); w[47] = __byte_perm (w[18], w[19], selector); w[46] = __byte_perm (w[17], w[18], selector); w[45] = __byte_perm (w[16], w[17], selector); w[44] = __byte_perm (w[15], w[16], selector); w[43] = __byte_perm (w[14], w[15], selector); w[42] = __byte_perm (w[13], w[14], selector); w[41] = __byte_perm (w[12], w[13], selector); w[40] = __byte_perm (w[11], w[12], selector); w[39] = __byte_perm (w[10], w[11], selector); w[38] = __byte_perm (w[ 9], w[10], selector); w[37] = __byte_perm (w[ 8], w[ 9], selector); w[36] = __byte_perm (w[ 7], w[ 8], selector); w[35] = __byte_perm (w[ 6], w[ 7], selector); w[34] = __byte_perm (w[ 5], w[ 6], selector); w[33] = __byte_perm (w[ 4], w[ 5], selector); w[32] = __byte_perm (w[ 3], w[ 4], selector); w[31] = __byte_perm (w[ 2], w[ 3], selector); w[30] = __byte_perm (w[ 1], w[ 2], selector); w[29] = __byte_perm (w[ 0], w[ 1], selector); w[28] = __byte_perm ( 0, w[ 0], selector); w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 29: w[63] = __byte_perm (w[33], w[34], selector); w[62] = __byte_perm (w[32], w[33], selector); w[61] = __byte_perm (w[31], w[32], selector); w[60] = __byte_perm (w[30], w[31], selector); w[59] = __byte_perm (w[29], w[30], selector); w[58] = __byte_perm (w[28], w[29], selector); w[57] = __byte_perm (w[27], w[28], selector); w[56] = __byte_perm (w[26], w[27], selector); w[55] = __byte_perm (w[25], w[26], selector); w[54] = __byte_perm (w[24], 
w[25], selector); w[53] = __byte_perm (w[23], w[24], selector); w[52] = __byte_perm (w[22], w[23], selector); w[51] = __byte_perm (w[21], w[22], selector); w[50] = __byte_perm (w[20], w[21], selector); w[49] = __byte_perm (w[19], w[20], selector); w[48] = __byte_perm (w[18], w[19], selector); w[47] = __byte_perm (w[17], w[18], selector); w[46] = __byte_perm (w[16], w[17], selector); w[45] = __byte_perm (w[15], w[16], selector); w[44] = __byte_perm (w[14], w[15], selector); w[43] = __byte_perm (w[13], w[14], selector); w[42] = __byte_perm (w[12], w[13], selector); w[41] = __byte_perm (w[11], w[12], selector); w[40] = __byte_perm (w[10], w[11], selector); w[39] = __byte_perm (w[ 9], w[10], selector); w[38] = __byte_perm (w[ 8], w[ 9], selector); w[37] = __byte_perm (w[ 7], w[ 8], selector); w[36] = __byte_perm (w[ 6], w[ 7], selector); w[35] = __byte_perm (w[ 5], w[ 6], selector); w[34] = __byte_perm (w[ 4], w[ 5], selector); w[33] = __byte_perm (w[ 3], w[ 4], selector); w[32] = __byte_perm (w[ 2], w[ 3], selector); w[31] = __byte_perm (w[ 1], w[ 2], selector); w[30] = __byte_perm (w[ 0], w[ 1], selector); w[29] = __byte_perm ( 0, w[ 0], selector); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 30: w[63] = __byte_perm (w[32], w[33], selector); w[62] = __byte_perm (w[31], w[32], selector); w[61] = __byte_perm (w[30], w[31], selector); w[60] = __byte_perm (w[29], w[30], selector); w[59] = __byte_perm (w[28], w[29], selector); w[58] = __byte_perm (w[27], w[28], selector); w[57] = __byte_perm (w[26], w[27], selector); w[56] = __byte_perm (w[25], w[26], selector); w[55] = __byte_perm (w[24], w[25], selector); w[54] = __byte_perm (w[23], w[24], selector); w[53] = __byte_perm (w[22], w[23], 
selector); w[52] = __byte_perm (w[21], w[22], selector); w[51] = __byte_perm (w[20], w[21], selector); w[50] = __byte_perm (w[19], w[20], selector); w[49] = __byte_perm (w[18], w[19], selector); w[48] = __byte_perm (w[17], w[18], selector); w[47] = __byte_perm (w[16], w[17], selector); w[46] = __byte_perm (w[15], w[16], selector); w[45] = __byte_perm (w[14], w[15], selector); w[44] = __byte_perm (w[13], w[14], selector); w[43] = __byte_perm (w[12], w[13], selector); w[42] = __byte_perm (w[11], w[12], selector); w[41] = __byte_perm (w[10], w[11], selector); w[40] = __byte_perm (w[ 9], w[10], selector); w[39] = __byte_perm (w[ 8], w[ 9], selector); w[38] = __byte_perm (w[ 7], w[ 8], selector); w[37] = __byte_perm (w[ 6], w[ 7], selector); w[36] = __byte_perm (w[ 5], w[ 6], selector); w[35] = __byte_perm (w[ 4], w[ 5], selector); w[34] = __byte_perm (w[ 3], w[ 4], selector); w[33] = __byte_perm (w[ 2], w[ 3], selector); w[32] = __byte_perm (w[ 1], w[ 2], selector); w[31] = __byte_perm (w[ 0], w[ 1], selector); w[30] = __byte_perm ( 0, w[ 0], selector); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 31: w[63] = __byte_perm (w[31], w[32], selector); w[62] = __byte_perm (w[30], w[31], selector); w[61] = __byte_perm (w[29], w[30], selector); w[60] = __byte_perm (w[28], w[29], selector); w[59] = __byte_perm (w[27], w[28], selector); w[58] = __byte_perm (w[26], w[27], selector); w[57] = __byte_perm (w[25], w[26], selector); w[56] = __byte_perm (w[24], w[25], selector); w[55] = __byte_perm (w[23], w[24], selector); w[54] = __byte_perm (w[22], w[23], selector); w[53] = __byte_perm (w[21], w[22], selector); w[52] = __byte_perm (w[20], w[21], selector); w[51] = __byte_perm (w[19], w[20], 
selector); w[50] = __byte_perm (w[18], w[19], selector); w[49] = __byte_perm (w[17], w[18], selector); w[48] = __byte_perm (w[16], w[17], selector); w[47] = __byte_perm (w[15], w[16], selector); w[46] = __byte_perm (w[14], w[15], selector); w[45] = __byte_perm (w[13], w[14], selector); w[44] = __byte_perm (w[12], w[13], selector); w[43] = __byte_perm (w[11], w[12], selector); w[42] = __byte_perm (w[10], w[11], selector); w[41] = __byte_perm (w[ 9], w[10], selector); w[40] = __byte_perm (w[ 8], w[ 9], selector); w[39] = __byte_perm (w[ 7], w[ 8], selector); w[38] = __byte_perm (w[ 6], w[ 7], selector); w[37] = __byte_perm (w[ 5], w[ 6], selector); w[36] = __byte_perm (w[ 4], w[ 5], selector); w[35] = __byte_perm (w[ 3], w[ 4], selector); w[34] = __byte_perm (w[ 2], w[ 3], selector); w[33] = __byte_perm (w[ 1], w[ 2], selector); w[32] = __byte_perm (w[ 0], w[ 1], selector); w[31] = __byte_perm ( 0, w[ 0], selector); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 32: w[63] = __byte_perm (w[30], w[31], selector); w[62] = __byte_perm (w[29], w[30], selector); w[61] = __byte_perm (w[28], w[29], selector); w[60] = __byte_perm (w[27], w[28], selector); w[59] = __byte_perm (w[26], w[27], selector); w[58] = __byte_perm (w[25], w[26], selector); w[57] = __byte_perm (w[24], w[25], selector); w[56] = __byte_perm (w[23], w[24], selector); w[55] = __byte_perm (w[22], w[23], selector); w[54] = __byte_perm (w[21], w[22], selector); w[53] = __byte_perm (w[20], w[21], selector); w[52] = __byte_perm (w[19], w[20], selector); w[51] = __byte_perm (w[18], w[19], selector); w[50] = __byte_perm (w[17], w[18], selector); w[49] = __byte_perm (w[16], w[17], selector); w[48] = __byte_perm 
(w[15], w[16], selector); w[47] = __byte_perm (w[14], w[15], selector); w[46] = __byte_perm (w[13], w[14], selector); w[45] = __byte_perm (w[12], w[13], selector); w[44] = __byte_perm (w[11], w[12], selector); w[43] = __byte_perm (w[10], w[11], selector); w[42] = __byte_perm (w[ 9], w[10], selector); w[41] = __byte_perm (w[ 8], w[ 9], selector); w[40] = __byte_perm (w[ 7], w[ 8], selector); w[39] = __byte_perm (w[ 6], w[ 7], selector); w[38] = __byte_perm (w[ 5], w[ 6], selector); w[37] = __byte_perm (w[ 4], w[ 5], selector); w[36] = __byte_perm (w[ 3], w[ 4], selector); w[35] = __byte_perm (w[ 2], w[ 3], selector); w[34] = __byte_perm (w[ 1], w[ 2], selector); w[33] = __byte_perm (w[ 0], w[ 1], selector); w[32] = __byte_perm ( 0, w[ 0], selector); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = __byte_perm (w[29], w[30], selector); w[62] = __byte_perm (w[28], w[29], selector); w[61] = __byte_perm (w[27], w[28], selector); w[60] = __byte_perm (w[26], w[27], selector); w[59] = __byte_perm (w[25], w[26], selector); w[58] = __byte_perm (w[24], w[25], selector); w[57] = __byte_perm (w[23], w[24], selector); w[56] = __byte_perm (w[22], w[23], selector); w[55] = __byte_perm (w[21], w[22], selector); w[54] = __byte_perm (w[20], w[21], selector); w[53] = __byte_perm (w[19], w[20], selector); w[52] = __byte_perm (w[18], w[19], selector); w[51] = __byte_perm (w[17], w[18], selector); w[50] = __byte_perm (w[16], w[17], selector); w[49] = __byte_perm (w[15], w[16], selector); w[48] = __byte_perm (w[14], w[15], selector); w[47] = __byte_perm (w[13], w[14], selector); w[46] = __byte_perm (w[12], w[13], selector); w[45] = __byte_perm (w[11], w[12], 
selector); w[44] = __byte_perm (w[10], w[11], selector); w[43] = __byte_perm (w[ 9], w[10], selector); w[42] = __byte_perm (w[ 8], w[ 9], selector); w[41] = __byte_perm (w[ 7], w[ 8], selector); w[40] = __byte_perm (w[ 6], w[ 7], selector); w[39] = __byte_perm (w[ 5], w[ 6], selector); w[38] = __byte_perm (w[ 4], w[ 5], selector); w[37] = __byte_perm (w[ 3], w[ 4], selector); w[36] = __byte_perm (w[ 2], w[ 3], selector); w[35] = __byte_perm (w[ 1], w[ 2], selector); w[34] = __byte_perm (w[ 0], w[ 1], selector); w[33] = __byte_perm ( 0, w[ 0], selector); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = __byte_perm (w[28], w[29], selector); w[62] = __byte_perm (w[27], w[28], selector); w[61] = __byte_perm (w[26], w[27], selector); w[60] = __byte_perm (w[25], w[26], selector); w[59] = __byte_perm (w[24], w[25], selector); w[58] = __byte_perm (w[23], w[24], selector); w[57] = __byte_perm (w[22], w[23], selector); w[56] = __byte_perm (w[21], w[22], selector); w[55] = __byte_perm (w[20], w[21], selector); w[54] = __byte_perm (w[19], w[20], selector); w[53] = __byte_perm (w[18], w[19], selector); w[52] = __byte_perm (w[17], w[18], selector); w[51] = __byte_perm (w[16], w[17], selector); w[50] = __byte_perm (w[15], w[16], selector); w[49] = __byte_perm (w[14], w[15], selector); w[48] = __byte_perm (w[13], w[14], selector); w[47] = __byte_perm (w[12], w[13], selector); w[46] = __byte_perm (w[11], w[12], selector); w[45] = __byte_perm (w[10], w[11], selector); w[44] = __byte_perm (w[ 9], w[10], selector); w[43] = __byte_perm (w[ 8], w[ 9], selector); w[42] = __byte_perm (w[ 7], w[ 8], selector); w[41] = __byte_perm (w[ 6], w[ 7], selector); 
w[40] = __byte_perm (w[ 5], w[ 6], selector); w[39] = __byte_perm (w[ 4], w[ 5], selector); w[38] = __byte_perm (w[ 3], w[ 4], selector); w[37] = __byte_perm (w[ 2], w[ 3], selector); w[36] = __byte_perm (w[ 1], w[ 2], selector); w[35] = __byte_perm (w[ 0], w[ 1], selector); w[34] = __byte_perm ( 0, w[ 0], selector); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = __byte_perm (w[27], w[28], selector); w[62] = __byte_perm (w[26], w[27], selector); w[61] = __byte_perm (w[25], w[26], selector); w[60] = __byte_perm (w[24], w[25], selector); w[59] = __byte_perm (w[23], w[24], selector); w[58] = __byte_perm (w[22], w[23], selector); w[57] = __byte_perm (w[21], w[22], selector); w[56] = __byte_perm (w[20], w[21], selector); w[55] = __byte_perm (w[19], w[20], selector); w[54] = __byte_perm (w[18], w[19], selector); w[53] = __byte_perm (w[17], w[18], selector); w[52] = __byte_perm (w[16], w[17], selector); w[51] = __byte_perm (w[15], w[16], selector); w[50] = __byte_perm (w[14], w[15], selector); w[49] = __byte_perm (w[13], w[14], selector); w[48] = __byte_perm (w[12], w[13], selector); w[47] = __byte_perm (w[11], w[12], selector); w[46] = __byte_perm (w[10], w[11], selector); w[45] = __byte_perm (w[ 9], w[10], selector); w[44] = __byte_perm (w[ 8], w[ 9], selector); w[43] = __byte_perm (w[ 7], w[ 8], selector); w[42] = __byte_perm (w[ 6], w[ 7], selector); w[41] = __byte_perm (w[ 5], w[ 6], selector); w[40] = __byte_perm (w[ 4], w[ 5], selector); w[39] = __byte_perm (w[ 3], w[ 4], selector); w[38] = __byte_perm (w[ 2], w[ 3], selector); w[37] = __byte_perm (w[ 1], w[ 2], selector); w[36] = __byte_perm (w[ 0], w[ 1], selector); 
w[35] = __byte_perm ( 0, w[ 0], selector); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = __byte_perm (w[26], w[27], selector); w[62] = __byte_perm (w[25], w[26], selector); w[61] = __byte_perm (w[24], w[25], selector); w[60] = __byte_perm (w[23], w[24], selector); w[59] = __byte_perm (w[22], w[23], selector); w[58] = __byte_perm (w[21], w[22], selector); w[57] = __byte_perm (w[20], w[21], selector); w[56] = __byte_perm (w[19], w[20], selector); w[55] = __byte_perm (w[18], w[19], selector); w[54] = __byte_perm (w[17], w[18], selector); w[53] = __byte_perm (w[16], w[17], selector); w[52] = __byte_perm (w[15], w[16], selector); w[51] = __byte_perm (w[14], w[15], selector); w[50] = __byte_perm (w[13], w[14], selector); w[49] = __byte_perm (w[12], w[13], selector); w[48] = __byte_perm (w[11], w[12], selector); w[47] = __byte_perm (w[10], w[11], selector); w[46] = __byte_perm (w[ 9], w[10], selector); w[45] = __byte_perm (w[ 8], w[ 9], selector); w[44] = __byte_perm (w[ 7], w[ 8], selector); w[43] = __byte_perm (w[ 6], w[ 7], selector); w[42] = __byte_perm (w[ 5], w[ 6], selector); w[41] = __byte_perm (w[ 4], w[ 5], selector); w[40] = __byte_perm (w[ 3], w[ 4], selector); w[39] = __byte_perm (w[ 2], w[ 3], selector); w[38] = __byte_perm (w[ 1], w[ 2], selector); w[37] = __byte_perm (w[ 0], w[ 1], selector); w[36] = __byte_perm ( 0, w[ 0], selector); w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] 
= 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = __byte_perm (w[25], w[26], selector); w[62] = __byte_perm (w[24], w[25], selector); w[61] = __byte_perm (w[23], w[24], selector); w[60] = __byte_perm (w[22], w[23], selector); w[59] = __byte_perm (w[21], w[22], selector); w[58] = __byte_perm (w[20], w[21], selector); w[57] = __byte_perm (w[19], w[20], selector); w[56] = __byte_perm (w[18], w[19], selector); w[55] = __byte_perm (w[17], w[18], selector); w[54] = __byte_perm (w[16], w[17], selector); w[53] = __byte_perm (w[15], w[16], selector); w[52] = __byte_perm (w[14], w[15], selector); w[51] = __byte_perm (w[13], w[14], selector); w[50] = __byte_perm (w[12], w[13], selector); w[49] = __byte_perm (w[11], w[12], selector); w[48] = __byte_perm (w[10], w[11], selector); w[47] = __byte_perm (w[ 9], w[10], selector); w[46] = __byte_perm (w[ 8], w[ 9], selector); w[45] = __byte_perm (w[ 7], w[ 8], selector); w[44] = __byte_perm (w[ 6], w[ 7], selector); w[43] = __byte_perm (w[ 5], w[ 6], selector); w[42] = __byte_perm (w[ 4], w[ 5], selector); w[41] = __byte_perm (w[ 3], w[ 4], selector); w[40] = __byte_perm (w[ 2], w[ 3], selector); w[39] = __byte_perm (w[ 1], w[ 2], selector); w[38] = __byte_perm (w[ 0], w[ 1], selector); w[37] = __byte_perm ( 0, w[ 0], selector); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = __byte_perm (w[24], w[25], selector); w[62] = __byte_perm (w[23], w[24], selector); w[61] = __byte_perm (w[22], w[23], selector); w[60] = __byte_perm (w[21], w[22], selector); w[59] = 
__byte_perm (w[20], w[21], selector); w[58] = __byte_perm (w[19], w[20], selector); w[57] = __byte_perm (w[18], w[19], selector); w[56] = __byte_perm (w[17], w[18], selector); w[55] = __byte_perm (w[16], w[17], selector); w[54] = __byte_perm (w[15], w[16], selector); w[53] = __byte_perm (w[14], w[15], selector); w[52] = __byte_perm (w[13], w[14], selector); w[51] = __byte_perm (w[12], w[13], selector); w[50] = __byte_perm (w[11], w[12], selector); w[49] = __byte_perm (w[10], w[11], selector); w[48] = __byte_perm (w[ 9], w[10], selector); w[47] = __byte_perm (w[ 8], w[ 9], selector); w[46] = __byte_perm (w[ 7], w[ 8], selector); w[45] = __byte_perm (w[ 6], w[ 7], selector); w[44] = __byte_perm (w[ 5], w[ 6], selector); w[43] = __byte_perm (w[ 4], w[ 5], selector); w[42] = __byte_perm (w[ 3], w[ 4], selector); w[41] = __byte_perm (w[ 2], w[ 3], selector); w[40] = __byte_perm (w[ 1], w[ 2], selector); w[39] = __byte_perm (w[ 0], w[ 1], selector); w[38] = __byte_perm ( 0, w[ 0], selector); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = __byte_perm (w[23], w[24], selector); w[62] = __byte_perm (w[22], w[23], selector); w[61] = __byte_perm (w[21], w[22], selector); w[60] = __byte_perm (w[20], w[21], selector); w[59] = __byte_perm (w[19], w[20], selector); w[58] = __byte_perm (w[18], w[19], selector); w[57] = __byte_perm (w[17], w[18], selector); w[56] = __byte_perm (w[16], w[17], selector); w[55] = __byte_perm (w[15], w[16], selector); w[54] = __byte_perm (w[14], w[15], selector); w[53] = __byte_perm (w[13], w[14], selector); w[52] = __byte_perm (w[12], w[13], selector); w[51] = 
__byte_perm (w[11], w[12], selector); w[50] = __byte_perm (w[10], w[11], selector); w[49] = __byte_perm (w[ 9], w[10], selector); w[48] = __byte_perm (w[ 8], w[ 9], selector); w[47] = __byte_perm (w[ 7], w[ 8], selector); w[46] = __byte_perm (w[ 6], w[ 7], selector); w[45] = __byte_perm (w[ 5], w[ 6], selector); w[44] = __byte_perm (w[ 4], w[ 5], selector); w[43] = __byte_perm (w[ 3], w[ 4], selector); w[42] = __byte_perm (w[ 2], w[ 3], selector); w[41] = __byte_perm (w[ 1], w[ 2], selector); w[40] = __byte_perm (w[ 0], w[ 1], selector); w[39] = __byte_perm ( 0, w[ 0], selector); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = __byte_perm (w[22], w[23], selector); w[62] = __byte_perm (w[21], w[22], selector); w[61] = __byte_perm (w[20], w[21], selector); w[60] = __byte_perm (w[19], w[20], selector); w[59] = __byte_perm (w[18], w[19], selector); w[58] = __byte_perm (w[17], w[18], selector); w[57] = __byte_perm (w[16], w[17], selector); w[56] = __byte_perm (w[15], w[16], selector); w[55] = __byte_perm (w[14], w[15], selector); w[54] = __byte_perm (w[13], w[14], selector); w[53] = __byte_perm (w[12], w[13], selector); w[52] = __byte_perm (w[11], w[12], selector); w[51] = __byte_perm (w[10], w[11], selector); w[50] = __byte_perm (w[ 9], w[10], selector); w[49] = __byte_perm (w[ 8], w[ 9], selector); w[48] = __byte_perm (w[ 7], w[ 8], selector); w[47] = __byte_perm (w[ 6], w[ 7], selector); w[46] = __byte_perm (w[ 5], w[ 6], selector); w[45] = __byte_perm (w[ 4], w[ 5], selector); w[44] = __byte_perm (w[ 3], w[ 4], selector); w[43] = __byte_perm (w[ 2], w[ 3], selector); 
w[42] = __byte_perm (w[ 1], w[ 2], selector); w[41] = __byte_perm (w[ 0], w[ 1], selector); w[40] = __byte_perm ( 0, w[ 0], selector); w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = __byte_perm (w[21], w[22], selector); w[62] = __byte_perm (w[20], w[21], selector); w[61] = __byte_perm (w[19], w[20], selector); w[60] = __byte_perm (w[18], w[19], selector); w[59] = __byte_perm (w[17], w[18], selector); w[58] = __byte_perm (w[16], w[17], selector); w[57] = __byte_perm (w[15], w[16], selector); w[56] = __byte_perm (w[14], w[15], selector); w[55] = __byte_perm (w[13], w[14], selector); w[54] = __byte_perm (w[12], w[13], selector); w[53] = __byte_perm (w[11], w[12], selector); w[52] = __byte_perm (w[10], w[11], selector); w[51] = __byte_perm (w[ 9], w[10], selector); w[50] = __byte_perm (w[ 8], w[ 9], selector); w[49] = __byte_perm (w[ 7], w[ 8], selector); w[48] = __byte_perm (w[ 6], w[ 7], selector); w[47] = __byte_perm (w[ 5], w[ 6], selector); w[46] = __byte_perm (w[ 4], w[ 5], selector); w[45] = __byte_perm (w[ 3], w[ 4], selector); w[44] = __byte_perm (w[ 2], w[ 3], selector); w[43] = __byte_perm (w[ 1], w[ 2], selector); w[42] = __byte_perm (w[ 0], w[ 1], selector); w[41] = __byte_perm ( 0, w[ 0], selector); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; 
w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 42: w[63] = __byte_perm (w[20], w[21], selector); w[62] = __byte_perm (w[19], w[20], selector); w[61] = __byte_perm (w[18], w[19], selector); w[60] = __byte_perm (w[17], w[18], selector); w[59] = __byte_perm (w[16], w[17], selector); w[58] = __byte_perm (w[15], w[16], selector); w[57] = __byte_perm (w[14], w[15], selector); w[56] = __byte_perm (w[13], w[14], selector); w[55] = __byte_perm (w[12], w[13], selector); w[54] = __byte_perm (w[11], w[12], selector); w[53] = __byte_perm (w[10], w[11], selector); w[52] = __byte_perm (w[ 9], w[10], selector); w[51] = __byte_perm (w[ 8], w[ 9], selector); w[50] = __byte_perm (w[ 7], w[ 8], selector); w[49] = __byte_perm (w[ 6], w[ 7], selector); w[48] = __byte_perm (w[ 5], w[ 6], selector); w[47] = __byte_perm (w[ 4], w[ 5], selector); w[46] = __byte_perm (w[ 3], w[ 4], selector); w[45] = __byte_perm (w[ 2], w[ 3], selector); w[44] = __byte_perm (w[ 1], w[ 2], selector); w[43] = __byte_perm (w[ 0], w[ 1], selector); w[42] = __byte_perm ( 0, w[ 0], selector); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = __byte_perm (w[19], w[20], selector); w[62] = __byte_perm (w[18], w[19], selector); w[61] = __byte_perm (w[17], w[18], selector); w[60] = __byte_perm (w[16], w[17], selector); w[59] = __byte_perm (w[15], w[16], selector); w[58] = __byte_perm (w[14], w[15], selector); w[57] = __byte_perm (w[13], w[14], selector); w[56] = __byte_perm (w[12], w[13], selector); w[55] = __byte_perm (w[11], 
w[12], selector); w[54] = __byte_perm (w[10], w[11], selector); w[53] = __byte_perm (w[ 9], w[10], selector); w[52] = __byte_perm (w[ 8], w[ 9], selector); w[51] = __byte_perm (w[ 7], w[ 8], selector); w[50] = __byte_perm (w[ 6], w[ 7], selector); w[49] = __byte_perm (w[ 5], w[ 6], selector); w[48] = __byte_perm (w[ 4], w[ 5], selector); w[47] = __byte_perm (w[ 3], w[ 4], selector); w[46] = __byte_perm (w[ 2], w[ 3], selector); w[45] = __byte_perm (w[ 1], w[ 2], selector); w[44] = __byte_perm (w[ 0], w[ 1], selector); w[43] = __byte_perm ( 0, w[ 0], selector); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = __byte_perm (w[18], w[19], selector); w[62] = __byte_perm (w[17], w[18], selector); w[61] = __byte_perm (w[16], w[17], selector); w[60] = __byte_perm (w[15], w[16], selector); w[59] = __byte_perm (w[14], w[15], selector); w[58] = __byte_perm (w[13], w[14], selector); w[57] = __byte_perm (w[12], w[13], selector); w[56] = __byte_perm (w[11], w[12], selector); w[55] = __byte_perm (w[10], w[11], selector); w[54] = __byte_perm (w[ 9], w[10], selector); w[53] = __byte_perm (w[ 8], w[ 9], selector); w[52] = __byte_perm (w[ 7], w[ 8], selector); w[51] = __byte_perm (w[ 6], w[ 7], selector); w[50] = __byte_perm (w[ 5], w[ 6], selector); w[49] = __byte_perm (w[ 4], w[ 5], selector); w[48] = __byte_perm (w[ 3], w[ 4], selector); w[47] = __byte_perm (w[ 2], w[ 3], selector); w[46] = __byte_perm (w[ 1], w[ 2], selector); w[45] = __byte_perm (w[ 0], w[ 1], selector); w[44] = __byte_perm ( 0, w[ 0], selector); w[43] = 0; w[42] = 0; 
w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = __byte_perm (w[17], w[18], selector); w[62] = __byte_perm (w[16], w[17], selector); w[61] = __byte_perm (w[15], w[16], selector); w[60] = __byte_perm (w[14], w[15], selector); w[59] = __byte_perm (w[13], w[14], selector); w[58] = __byte_perm (w[12], w[13], selector); w[57] = __byte_perm (w[11], w[12], selector); w[56] = __byte_perm (w[10], w[11], selector); w[55] = __byte_perm (w[ 9], w[10], selector); w[54] = __byte_perm (w[ 8], w[ 9], selector); w[53] = __byte_perm (w[ 7], w[ 8], selector); w[52] = __byte_perm (w[ 6], w[ 7], selector); w[51] = __byte_perm (w[ 5], w[ 6], selector); w[50] = __byte_perm (w[ 4], w[ 5], selector); w[49] = __byte_perm (w[ 3], w[ 4], selector); w[48] = __byte_perm (w[ 2], w[ 3], selector); w[47] = __byte_perm (w[ 1], w[ 2], selector); w[46] = __byte_perm (w[ 0], w[ 1], selector); w[45] = __byte_perm ( 0, w[ 0], selector); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = __byte_perm (w[16], w[17], selector); w[62] = __byte_perm (w[15], w[16], selector); w[61] = __byte_perm (w[14], w[15], selector); 
w[60] = __byte_perm (w[13], w[14], selector); w[59] = __byte_perm (w[12], w[13], selector); w[58] = __byte_perm (w[11], w[12], selector); w[57] = __byte_perm (w[10], w[11], selector); w[56] = __byte_perm (w[ 9], w[10], selector); w[55] = __byte_perm (w[ 8], w[ 9], selector); w[54] = __byte_perm (w[ 7], w[ 8], selector); w[53] = __byte_perm (w[ 6], w[ 7], selector); w[52] = __byte_perm (w[ 5], w[ 6], selector); w[51] = __byte_perm (w[ 4], w[ 5], selector); w[50] = __byte_perm (w[ 3], w[ 4], selector); w[49] = __byte_perm (w[ 2], w[ 3], selector); w[48] = __byte_perm (w[ 1], w[ 2], selector); w[47] = __byte_perm (w[ 0], w[ 1], selector); w[46] = __byte_perm ( 0, w[ 0], selector); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 47: w[63] = __byte_perm (w[15], w[16], selector); w[62] = __byte_perm (w[14], w[15], selector); w[61] = __byte_perm (w[13], w[14], selector); w[60] = __byte_perm (w[12], w[13], selector); w[59] = __byte_perm (w[11], w[12], selector); w[58] = __byte_perm (w[10], w[11], selector); w[57] = __byte_perm (w[ 9], w[10], selector); w[56] = __byte_perm (w[ 8], w[ 9], selector); w[55] = __byte_perm (w[ 7], w[ 8], selector); w[54] = __byte_perm (w[ 6], w[ 7], selector); w[53] = __byte_perm (w[ 5], w[ 6], selector); w[52] = __byte_perm (w[ 4], w[ 5], selector); w[51] = __byte_perm (w[ 3], w[ 4], selector); w[50] = __byte_perm (w[ 2], w[ 3], selector); w[49] = __byte_perm (w[ 1], w[ 2], selector); w[48] = __byte_perm (w[ 0], w[ 1], selector); w[47] = __byte_perm ( 0, w[ 0], selector); w[46] = 0; 
w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = __byte_perm (w[14], w[15], selector); w[62] = __byte_perm (w[13], w[14], selector); w[61] = __byte_perm (w[12], w[13], selector); w[60] = __byte_perm (w[11], w[12], selector); w[59] = __byte_perm (w[10], w[11], selector); w[58] = __byte_perm (w[ 9], w[10], selector); w[57] = __byte_perm (w[ 8], w[ 9], selector); w[56] = __byte_perm (w[ 7], w[ 8], selector); w[55] = __byte_perm (w[ 6], w[ 7], selector); w[54] = __byte_perm (w[ 5], w[ 6], selector); w[53] = __byte_perm (w[ 4], w[ 5], selector); w[52] = __byte_perm (w[ 3], w[ 4], selector); w[51] = __byte_perm (w[ 2], w[ 3], selector); w[50] = __byte_perm (w[ 1], w[ 2], selector); w[49] = __byte_perm (w[ 0], w[ 1], selector); w[48] = __byte_perm ( 0, w[ 0], selector); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = __byte_perm (w[13], w[14], selector); w[62] = __byte_perm (w[12], w[13], selector); w[61] = __byte_perm (w[11], w[12], selector); w[60] = __byte_perm (w[10], w[11], selector); w[59] = 
__byte_perm (w[ 9], w[10], selector); w[58] = __byte_perm (w[ 8], w[ 9], selector); w[57] = __byte_perm (w[ 7], w[ 8], selector); w[56] = __byte_perm (w[ 6], w[ 7], selector); w[55] = __byte_perm (w[ 5], w[ 6], selector); w[54] = __byte_perm (w[ 4], w[ 5], selector); w[53] = __byte_perm (w[ 3], w[ 4], selector); w[52] = __byte_perm (w[ 2], w[ 3], selector); w[51] = __byte_perm (w[ 1], w[ 2], selector); w[50] = __byte_perm (w[ 0], w[ 1], selector); w[49] = __byte_perm ( 0, w[ 0], selector); w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = __byte_perm (w[12], w[13], selector); w[62] = __byte_perm (w[11], w[12], selector); w[61] = __byte_perm (w[10], w[11], selector); w[60] = __byte_perm (w[ 9], w[10], selector); w[59] = __byte_perm (w[ 8], w[ 9], selector); w[58] = __byte_perm (w[ 7], w[ 8], selector); w[57] = __byte_perm (w[ 6], w[ 7], selector); w[56] = __byte_perm (w[ 5], w[ 6], selector); w[55] = __byte_perm (w[ 4], w[ 5], selector); w[54] = __byte_perm (w[ 3], w[ 4], selector); w[53] = __byte_perm (w[ 2], w[ 3], selector); w[52] = __byte_perm (w[ 1], w[ 2], selector); w[51] = __byte_perm (w[ 0], w[ 1], selector); w[50] = __byte_perm ( 0, w[ 0], selector); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; 
w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = __byte_perm (w[11], w[12], selector); w[62] = __byte_perm (w[10], w[11], selector); w[61] = __byte_perm (w[ 9], w[10], selector); w[60] = __byte_perm (w[ 8], w[ 9], selector); w[59] = __byte_perm (w[ 7], w[ 8], selector); w[58] = __byte_perm (w[ 6], w[ 7], selector); w[57] = __byte_perm (w[ 5], w[ 6], selector); w[56] = __byte_perm (w[ 4], w[ 5], selector); w[55] = __byte_perm (w[ 3], w[ 4], selector); w[54] = __byte_perm (w[ 2], w[ 3], selector); w[53] = __byte_perm (w[ 1], w[ 2], selector); w[52] = __byte_perm (w[ 0], w[ 1], selector); w[51] = __byte_perm ( 0, w[ 0], selector); w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = __byte_perm (w[10], w[11], selector); w[62] = __byte_perm (w[ 9], w[10], selector); w[61] = __byte_perm (w[ 8], w[ 9], selector); w[60] = __byte_perm (w[ 7], w[ 8], selector); w[59] = __byte_perm (w[ 6], w[ 7], selector); w[58] = __byte_perm (w[ 5], w[ 6], selector); w[57] = __byte_perm (w[ 4], w[ 5], selector); w[56] = __byte_perm (w[ 3], w[ 4], selector); w[55] = __byte_perm (w[ 2], w[ 3], selector); w[54] = __byte_perm (w[ 1], w[ 2], selector); w[53] = __byte_perm (w[ 0], w[ 1], selector); w[52] = __byte_perm ( 0, w[ 0], selector); w[51] = 0; w[50] = 
0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = __byte_perm (w[ 9], w[10], selector); w[62] = __byte_perm (w[ 8], w[ 9], selector); w[61] = __byte_perm (w[ 7], w[ 8], selector); w[60] = __byte_perm (w[ 6], w[ 7], selector); w[59] = __byte_perm (w[ 5], w[ 6], selector); w[58] = __byte_perm (w[ 4], w[ 5], selector); w[57] = __byte_perm (w[ 3], w[ 4], selector); w[56] = __byte_perm (w[ 2], w[ 3], selector); w[55] = __byte_perm (w[ 1], w[ 2], selector); w[54] = __byte_perm (w[ 0], w[ 1], selector); w[53] = __byte_perm ( 0, w[ 0], selector); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = __byte_perm (w[ 8], w[ 9], selector); w[62] = __byte_perm (w[ 7], w[ 8], selector); w[61] = __byte_perm (w[ 6], w[ 7], selector); w[60] = __byte_perm (w[ 5], w[ 6], selector); w[59] = __byte_perm (w[ 4], w[ 5], selector); w[58] = __byte_perm (w[ 3], w[ 4], selector); w[57] = __byte_perm (w[ 2], w[ 3], selector); w[56] 
= __byte_perm (w[ 1], w[ 2], selector); w[55] = __byte_perm (w[ 0], w[ 1], selector); w[54] = __byte_perm ( 0, w[ 0], selector); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = __byte_perm (w[ 7], w[ 8], selector); w[62] = __byte_perm (w[ 6], w[ 7], selector); w[61] = __byte_perm (w[ 5], w[ 6], selector); w[60] = __byte_perm (w[ 4], w[ 5], selector); w[59] = __byte_perm (w[ 3], w[ 4], selector); w[58] = __byte_perm (w[ 2], w[ 3], selector); w[57] = __byte_perm (w[ 1], w[ 2], selector); w[56] = __byte_perm (w[ 0], w[ 1], selector); w[55] = __byte_perm ( 0, w[ 0], selector); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 56: w[63] = __byte_perm (w[ 6], w[ 7], selector); w[62] = __byte_perm (w[ 5], w[ 6], selector); w[61] = __byte_perm (w[ 4], w[ 5], selector); w[60] = __byte_perm (w[ 3], w[ 4], selector); w[59] = __byte_perm (w[ 2], w[ 3], 
selector); w[58] = __byte_perm (w[ 1], w[ 2], selector); w[57] = __byte_perm (w[ 0], w[ 1], selector); w[56] = __byte_perm ( 0, w[ 0], selector); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = __byte_perm (w[ 5], w[ 6], selector); w[62] = __byte_perm (w[ 4], w[ 5], selector); w[61] = __byte_perm (w[ 3], w[ 4], selector); w[60] = __byte_perm (w[ 2], w[ 3], selector); w[59] = __byte_perm (w[ 1], w[ 2], selector); w[58] = __byte_perm (w[ 0], w[ 1], selector); w[57] = __byte_perm ( 0, w[ 0], selector); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = __byte_perm (w[ 4], w[ 5], selector); w[62] = __byte_perm (w[ 3], w[ 4], selector); w[61] = __byte_perm (w[ 2], w[ 3], selector); w[60] = __byte_perm (w[ 1], w[ 2], selector); w[59] = __byte_perm (w[ 0], w[ 1], selector); w[58] = __byte_perm ( 0, w[ 
0], selector); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = __byte_perm (w[ 3], w[ 4], selector); w[62] = __byte_perm (w[ 2], w[ 3], selector); w[61] = __byte_perm (w[ 1], w[ 2], selector); w[60] = __byte_perm (w[ 0], w[ 1], selector); w[59] = __byte_perm ( 0, w[ 0], selector); w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = __byte_perm (w[ 2], w[ 3], selector); w[62] = __byte_perm (w[ 1], w[ 2], selector); w[61] = __byte_perm (w[ 0], w[ 1], selector); w[60] = __byte_perm ( 0, w[ 0], selector); w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; 
w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = __byte_perm (w[ 1], w[ 2], selector); w[62] = __byte_perm (w[ 0], w[ 1], selector); w[61] = __byte_perm ( 0, w[ 0], selector); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = __byte_perm (w[ 0], w[ 1], selector); w[62] = __byte_perm ( 0, w[ 0], selector); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] 
= 0; w[ 0] = 0; break; case 63: w[63] = __byte_perm ( 0, w[ 0], selector); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_1x64_be (u32x w[64], const u32 offset) { const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: w[63] = amd_bytealign (w[62], w[63], offset); w[62] = amd_bytealign (w[61], w[62], offset); w[61] = amd_bytealign (w[60], w[61], offset); w[60] = amd_bytealign (w[59], w[60], offset); w[59] = amd_bytealign (w[58], w[59], offset); w[58] = amd_bytealign (w[57], w[58], offset); w[57] = amd_bytealign (w[56], w[57], offset); w[56] = amd_bytealign (w[55], w[56], offset); w[55] = amd_bytealign (w[54], w[55], offset); w[54] = amd_bytealign (w[53], w[54], offset); w[53] = amd_bytealign (w[52], w[53], offset); w[52] = amd_bytealign (w[51], w[52], offset); w[51] = amd_bytealign (w[50], w[51], offset); w[50] = amd_bytealign (w[49], w[50], offset); w[49] = amd_bytealign (w[48], w[49], offset); w[48] = amd_bytealign (w[47], w[48], offset); w[47] = amd_bytealign (w[46], w[47], offset); w[46] = amd_bytealign (w[45], w[46], offset); w[45] = amd_bytealign (w[44], w[45], offset); w[44] = amd_bytealign (w[43], w[44], offset); w[43] = amd_bytealign (w[42], w[43], offset); w[42] = amd_bytealign (w[41], w[42], 
offset); w[41] = amd_bytealign (w[40], w[41], offset); w[40] = amd_bytealign (w[39], w[40], offset); w[39] = amd_bytealign (w[38], w[39], offset); w[38] = amd_bytealign (w[37], w[38], offset); w[37] = amd_bytealign (w[36], w[37], offset); w[36] = amd_bytealign (w[35], w[36], offset); w[35] = amd_bytealign (w[34], w[35], offset); w[34] = amd_bytealign (w[33], w[34], offset); w[33] = amd_bytealign (w[32], w[33], offset); w[32] = amd_bytealign (w[31], w[32], offset); w[31] = amd_bytealign (w[30], w[31], offset); w[30] = amd_bytealign (w[29], w[30], offset); w[29] = amd_bytealign (w[28], w[29], offset); w[28] = amd_bytealign (w[27], w[28], offset); w[27] = amd_bytealign (w[26], w[27], offset); w[26] = amd_bytealign (w[25], w[26], offset); w[25] = amd_bytealign (w[24], w[25], offset); w[24] = amd_bytealign (w[23], w[24], offset); w[23] = amd_bytealign (w[22], w[23], offset); w[22] = amd_bytealign (w[21], w[22], offset); w[21] = amd_bytealign (w[20], w[21], offset); w[20] = amd_bytealign (w[19], w[20], offset); w[19] = amd_bytealign (w[18], w[19], offset); w[18] = amd_bytealign (w[17], w[18], offset); w[17] = amd_bytealign (w[16], w[17], offset); w[16] = amd_bytealign (w[15], w[16], offset); w[15] = amd_bytealign (w[14], w[15], offset); w[14] = amd_bytealign (w[13], w[14], offset); w[13] = amd_bytealign (w[12], w[13], offset); w[12] = amd_bytealign (w[11], w[12], offset); w[11] = amd_bytealign (w[10], w[11], offset); w[10] = amd_bytealign (w[ 9], w[10], offset); w[ 9] = amd_bytealign (w[ 8], w[ 9], offset); w[ 8] = amd_bytealign (w[ 7], w[ 8], offset); w[ 7] = amd_bytealign (w[ 6], w[ 7], offset); w[ 6] = amd_bytealign (w[ 5], w[ 6], offset); w[ 5] = amd_bytealign (w[ 4], w[ 5], offset); w[ 4] = amd_bytealign (w[ 3], w[ 4], offset); w[ 3] = amd_bytealign (w[ 2], w[ 3], offset); w[ 2] = amd_bytealign (w[ 1], w[ 2], offset); w[ 1] = amd_bytealign (w[ 0], w[ 1], offset); w[ 0] = amd_bytealign ( 0, w[ 0], offset); break; case 1: w[63] = amd_bytealign (w[61], w[62], offset); 
w[62] = amd_bytealign (w[60], w[61], offset); w[61] = amd_bytealign (w[59], w[60], offset); w[60] = amd_bytealign (w[58], w[59], offset); w[59] = amd_bytealign (w[57], w[58], offset); w[58] = amd_bytealign (w[56], w[57], offset); w[57] = amd_bytealign (w[55], w[56], offset); w[56] = amd_bytealign (w[54], w[55], offset); w[55] = amd_bytealign (w[53], w[54], offset); w[54] = amd_bytealign (w[52], w[53], offset); w[53] = amd_bytealign (w[51], w[52], offset); w[52] = amd_bytealign (w[50], w[51], offset); w[51] = amd_bytealign (w[49], w[50], offset); w[50] = amd_bytealign (w[48], w[49], offset); w[49] = amd_bytealign (w[47], w[48], offset); w[48] = amd_bytealign (w[46], w[47], offset); w[47] = amd_bytealign (w[45], w[46], offset); w[46] = amd_bytealign (w[44], w[45], offset); w[45] = amd_bytealign (w[43], w[44], offset); w[44] = amd_bytealign (w[42], w[43], offset); w[43] = amd_bytealign (w[41], w[42], offset); w[42] = amd_bytealign (w[40], w[41], offset); w[41] = amd_bytealign (w[39], w[40], offset); w[40] = amd_bytealign (w[38], w[39], offset); w[39] = amd_bytealign (w[37], w[38], offset); w[38] = amd_bytealign (w[36], w[37], offset); w[37] = amd_bytealign (w[35], w[36], offset); w[36] = amd_bytealign (w[34], w[35], offset); w[35] = amd_bytealign (w[33], w[34], offset); w[34] = amd_bytealign (w[32], w[33], offset); w[33] = amd_bytealign (w[31], w[32], offset); w[32] = amd_bytealign (w[30], w[31], offset); w[31] = amd_bytealign (w[29], w[30], offset); w[30] = amd_bytealign (w[28], w[29], offset); w[29] = amd_bytealign (w[27], w[28], offset); w[28] = amd_bytealign (w[26], w[27], offset); w[27] = amd_bytealign (w[25], w[26], offset); w[26] = amd_bytealign (w[24], w[25], offset); w[25] = amd_bytealign (w[23], w[24], offset); w[24] = amd_bytealign (w[22], w[23], offset); w[23] = amd_bytealign (w[21], w[22], offset); w[22] = amd_bytealign (w[20], w[21], offset); w[21] = amd_bytealign (w[19], w[20], offset); w[20] = amd_bytealign (w[18], w[19], offset); w[19] = amd_bytealign 
(w[17], w[18], offset); w[18] = amd_bytealign (w[16], w[17], offset); w[17] = amd_bytealign (w[15], w[16], offset); w[16] = amd_bytealign (w[14], w[15], offset); w[15] = amd_bytealign (w[13], w[14], offset); w[14] = amd_bytealign (w[12], w[13], offset); w[13] = amd_bytealign (w[11], w[12], offset); w[12] = amd_bytealign (w[10], w[11], offset); w[11] = amd_bytealign (w[ 9], w[10], offset); w[10] = amd_bytealign (w[ 8], w[ 9], offset); w[ 9] = amd_bytealign (w[ 7], w[ 8], offset); w[ 8] = amd_bytealign (w[ 6], w[ 7], offset); w[ 7] = amd_bytealign (w[ 5], w[ 6], offset); w[ 6] = amd_bytealign (w[ 4], w[ 5], offset); w[ 5] = amd_bytealign (w[ 3], w[ 4], offset); w[ 4] = amd_bytealign (w[ 2], w[ 3], offset); w[ 3] = amd_bytealign (w[ 1], w[ 2], offset); w[ 2] = amd_bytealign (w[ 0], w[ 1], offset); w[ 1] = amd_bytealign ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: w[63] = amd_bytealign (w[60], w[61], offset); w[62] = amd_bytealign (w[59], w[60], offset); w[61] = amd_bytealign (w[58], w[59], offset); w[60] = amd_bytealign (w[57], w[58], offset); w[59] = amd_bytealign (w[56], w[57], offset); w[58] = amd_bytealign (w[55], w[56], offset); w[57] = amd_bytealign (w[54], w[55], offset); w[56] = amd_bytealign (w[53], w[54], offset); w[55] = amd_bytealign (w[52], w[53], offset); w[54] = amd_bytealign (w[51], w[52], offset); w[53] = amd_bytealign (w[50], w[51], offset); w[52] = amd_bytealign (w[49], w[50], offset); w[51] = amd_bytealign (w[48], w[49], offset); w[50] = amd_bytealign (w[47], w[48], offset); w[49] = amd_bytealign (w[46], w[47], offset); w[48] = amd_bytealign (w[45], w[46], offset); w[47] = amd_bytealign (w[44], w[45], offset); w[46] = amd_bytealign (w[43], w[44], offset); w[45] = amd_bytealign (w[42], w[43], offset); w[44] = amd_bytealign (w[41], w[42], offset); w[43] = amd_bytealign (w[40], w[41], offset); w[42] = amd_bytealign (w[39], w[40], offset); w[41] = amd_bytealign (w[38], w[39], offset); w[40] = amd_bytealign (w[37], w[38], offset); w[39] = 
amd_bytealign (w[36], w[37], offset); w[38] = amd_bytealign (w[35], w[36], offset); w[37] = amd_bytealign (w[34], w[35], offset); w[36] = amd_bytealign (w[33], w[34], offset); w[35] = amd_bytealign (w[32], w[33], offset); w[34] = amd_bytealign (w[31], w[32], offset); w[33] = amd_bytealign (w[30], w[31], offset); w[32] = amd_bytealign (w[29], w[30], offset); w[31] = amd_bytealign (w[28], w[29], offset); w[30] = amd_bytealign (w[27], w[28], offset); w[29] = amd_bytealign (w[26], w[27], offset); w[28] = amd_bytealign (w[25], w[26], offset); w[27] = amd_bytealign (w[24], w[25], offset); w[26] = amd_bytealign (w[23], w[24], offset); w[25] = amd_bytealign (w[22], w[23], offset); w[24] = amd_bytealign (w[21], w[22], offset); w[23] = amd_bytealign (w[20], w[21], offset); w[22] = amd_bytealign (w[19], w[20], offset); w[21] = amd_bytealign (w[18], w[19], offset); w[20] = amd_bytealign (w[17], w[18], offset); w[19] = amd_bytealign (w[16], w[17], offset); w[18] = amd_bytealign (w[15], w[16], offset); w[17] = amd_bytealign (w[14], w[15], offset); w[16] = amd_bytealign (w[13], w[14], offset); w[15] = amd_bytealign (w[12], w[13], offset); w[14] = amd_bytealign (w[11], w[12], offset); w[13] = amd_bytealign (w[10], w[11], offset); w[12] = amd_bytealign (w[ 9], w[10], offset); w[11] = amd_bytealign (w[ 8], w[ 9], offset); w[10] = amd_bytealign (w[ 7], w[ 8], offset); w[ 9] = amd_bytealign (w[ 6], w[ 7], offset); w[ 8] = amd_bytealign (w[ 5], w[ 6], offset); w[ 7] = amd_bytealign (w[ 4], w[ 5], offset); w[ 6] = amd_bytealign (w[ 3], w[ 4], offset); w[ 5] = amd_bytealign (w[ 2], w[ 3], offset); w[ 4] = amd_bytealign (w[ 1], w[ 2], offset); w[ 3] = amd_bytealign (w[ 0], w[ 1], offset); w[ 2] = amd_bytealign ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = amd_bytealign (w[59], w[60], offset); w[62] = amd_bytealign (w[58], w[59], offset); w[61] = amd_bytealign (w[57], w[58], offset); w[60] = amd_bytealign (w[56], w[57], offset); w[59] = amd_bytealign (w[55], w[56], 
offset); w[58] = amd_bytealign (w[54], w[55], offset); w[57] = amd_bytealign (w[53], w[54], offset); w[56] = amd_bytealign (w[52], w[53], offset); w[55] = amd_bytealign (w[51], w[52], offset); w[54] = amd_bytealign (w[50], w[51], offset); w[53] = amd_bytealign (w[49], w[50], offset); w[52] = amd_bytealign (w[48], w[49], offset); w[51] = amd_bytealign (w[47], w[48], offset); w[50] = amd_bytealign (w[46], w[47], offset); w[49] = amd_bytealign (w[45], w[46], offset); w[48] = amd_bytealign (w[44], w[45], offset); w[47] = amd_bytealign (w[43], w[44], offset); w[46] = amd_bytealign (w[42], w[43], offset); w[45] = amd_bytealign (w[41], w[42], offset); w[44] = amd_bytealign (w[40], w[41], offset); w[43] = amd_bytealign (w[39], w[40], offset); w[42] = amd_bytealign (w[38], w[39], offset); w[41] = amd_bytealign (w[37], w[38], offset); w[40] = amd_bytealign (w[36], w[37], offset); w[39] = amd_bytealign (w[35], w[36], offset); w[38] = amd_bytealign (w[34], w[35], offset); w[37] = amd_bytealign (w[33], w[34], offset); w[36] = amd_bytealign (w[32], w[33], offset); w[35] = amd_bytealign (w[31], w[32], offset); w[34] = amd_bytealign (w[30], w[31], offset); w[33] = amd_bytealign (w[29], w[30], offset); w[32] = amd_bytealign (w[28], w[29], offset); w[31] = amd_bytealign (w[27], w[28], offset); w[30] = amd_bytealign (w[26], w[27], offset); w[29] = amd_bytealign (w[25], w[26], offset); w[28] = amd_bytealign (w[24], w[25], offset); w[27] = amd_bytealign (w[23], w[24], offset); w[26] = amd_bytealign (w[22], w[23], offset); w[25] = amd_bytealign (w[21], w[22], offset); w[24] = amd_bytealign (w[20], w[21], offset); w[23] = amd_bytealign (w[19], w[20], offset); w[22] = amd_bytealign (w[18], w[19], offset); w[21] = amd_bytealign (w[17], w[18], offset); w[20] = amd_bytealign (w[16], w[17], offset); w[19] = amd_bytealign (w[15], w[16], offset); w[18] = amd_bytealign (w[14], w[15], offset); w[17] = amd_bytealign (w[13], w[14], offset); w[16] = amd_bytealign (w[12], w[13], offset); w[15] = 
amd_bytealign (w[11], w[12], offset); w[14] = amd_bytealign (w[10], w[11], offset); w[13] = amd_bytealign (w[ 9], w[10], offset); w[12] = amd_bytealign (w[ 8], w[ 9], offset); w[11] = amd_bytealign (w[ 7], w[ 8], offset); w[10] = amd_bytealign (w[ 6], w[ 7], offset); w[ 9] = amd_bytealign (w[ 5], w[ 6], offset); w[ 8] = amd_bytealign (w[ 4], w[ 5], offset); w[ 7] = amd_bytealign (w[ 3], w[ 4], offset); w[ 6] = amd_bytealign (w[ 2], w[ 3], offset); w[ 5] = amd_bytealign (w[ 1], w[ 2], offset); w[ 4] = amd_bytealign (w[ 0], w[ 1], offset); w[ 3] = amd_bytealign ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = amd_bytealign (w[58], w[59], offset); w[62] = amd_bytealign (w[57], w[58], offset); w[61] = amd_bytealign (w[56], w[57], offset); w[60] = amd_bytealign (w[55], w[56], offset); w[59] = amd_bytealign (w[54], w[55], offset); w[58] = amd_bytealign (w[53], w[54], offset); w[57] = amd_bytealign (w[52], w[53], offset); w[56] = amd_bytealign (w[51], w[52], offset); w[55] = amd_bytealign (w[50], w[51], offset); w[54] = amd_bytealign (w[49], w[50], offset); w[53] = amd_bytealign (w[48], w[49], offset); w[52] = amd_bytealign (w[47], w[48], offset); w[51] = amd_bytealign (w[46], w[47], offset); w[50] = amd_bytealign (w[45], w[46], offset); w[49] = amd_bytealign (w[44], w[45], offset); w[48] = amd_bytealign (w[43], w[44], offset); w[47] = amd_bytealign (w[42], w[43], offset); w[46] = amd_bytealign (w[41], w[42], offset); w[45] = amd_bytealign (w[40], w[41], offset); w[44] = amd_bytealign (w[39], w[40], offset); w[43] = amd_bytealign (w[38], w[39], offset); w[42] = amd_bytealign (w[37], w[38], offset); w[41] = amd_bytealign (w[36], w[37], offset); w[40] = amd_bytealign (w[35], w[36], offset); w[39] = amd_bytealign (w[34], w[35], offset); w[38] = amd_bytealign (w[33], w[34], offset); w[37] = amd_bytealign (w[32], w[33], offset); w[36] = amd_bytealign (w[31], w[32], offset); w[35] = amd_bytealign (w[30], w[31], offset); w[34] = amd_bytealign (w[29], 
w[30], offset); w[33] = amd_bytealign (w[28], w[29], offset); w[32] = amd_bytealign (w[27], w[28], offset); w[31] = amd_bytealign (w[26], w[27], offset); w[30] = amd_bytealign (w[25], w[26], offset); w[29] = amd_bytealign (w[24], w[25], offset); w[28] = amd_bytealign (w[23], w[24], offset); w[27] = amd_bytealign (w[22], w[23], offset); w[26] = amd_bytealign (w[21], w[22], offset); w[25] = amd_bytealign (w[20], w[21], offset); w[24] = amd_bytealign (w[19], w[20], offset); w[23] = amd_bytealign (w[18], w[19], offset); w[22] = amd_bytealign (w[17], w[18], offset); w[21] = amd_bytealign (w[16], w[17], offset); w[20] = amd_bytealign (w[15], w[16], offset); w[19] = amd_bytealign (w[14], w[15], offset); w[18] = amd_bytealign (w[13], w[14], offset); w[17] = amd_bytealign (w[12], w[13], offset); w[16] = amd_bytealign (w[11], w[12], offset); w[15] = amd_bytealign (w[10], w[11], offset); w[14] = amd_bytealign (w[ 9], w[10], offset); w[13] = amd_bytealign (w[ 8], w[ 9], offset); w[12] = amd_bytealign (w[ 7], w[ 8], offset); w[11] = amd_bytealign (w[ 6], w[ 7], offset); w[10] = amd_bytealign (w[ 5], w[ 6], offset); w[ 9] = amd_bytealign (w[ 4], w[ 5], offset); w[ 8] = amd_bytealign (w[ 3], w[ 4], offset); w[ 7] = amd_bytealign (w[ 2], w[ 3], offset); w[ 6] = amd_bytealign (w[ 1], w[ 2], offset); w[ 5] = amd_bytealign (w[ 0], w[ 1], offset); w[ 4] = amd_bytealign ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = amd_bytealign (w[57], w[58], offset); w[62] = amd_bytealign (w[56], w[57], offset); w[61] = amd_bytealign (w[55], w[56], offset); w[60] = amd_bytealign (w[54], w[55], offset); w[59] = amd_bytealign (w[53], w[54], offset); w[58] = amd_bytealign (w[52], w[53], offset); w[57] = amd_bytealign (w[51], w[52], offset); w[56] = amd_bytealign (w[50], w[51], offset); w[55] = amd_bytealign (w[49], w[50], offset); w[54] = amd_bytealign (w[48], w[49], offset); w[53] = amd_bytealign (w[47], w[48], offset); w[52] = amd_bytealign (w[46], w[47], 
offset); w[51] = amd_bytealign (w[45], w[46], offset); w[50] = amd_bytealign (w[44], w[45], offset); w[49] = amd_bytealign (w[43], w[44], offset); w[48] = amd_bytealign (w[42], w[43], offset); w[47] = amd_bytealign (w[41], w[42], offset); w[46] = amd_bytealign (w[40], w[41], offset); w[45] = amd_bytealign (w[39], w[40], offset); w[44] = amd_bytealign (w[38], w[39], offset); w[43] = amd_bytealign (w[37], w[38], offset); w[42] = amd_bytealign (w[36], w[37], offset); w[41] = amd_bytealign (w[35], w[36], offset); w[40] = amd_bytealign (w[34], w[35], offset); w[39] = amd_bytealign (w[33], w[34], offset); w[38] = amd_bytealign (w[32], w[33], offset); w[37] = amd_bytealign (w[31], w[32], offset); w[36] = amd_bytealign (w[30], w[31], offset); w[35] = amd_bytealign (w[29], w[30], offset); w[34] = amd_bytealign (w[28], w[29], offset); w[33] = amd_bytealign (w[27], w[28], offset); w[32] = amd_bytealign (w[26], w[27], offset); w[31] = amd_bytealign (w[25], w[26], offset); w[30] = amd_bytealign (w[24], w[25], offset); w[29] = amd_bytealign (w[23], w[24], offset); w[28] = amd_bytealign (w[22], w[23], offset); w[27] = amd_bytealign (w[21], w[22], offset); w[26] = amd_bytealign (w[20], w[21], offset); w[25] = amd_bytealign (w[19], w[20], offset); w[24] = amd_bytealign (w[18], w[19], offset); w[23] = amd_bytealign (w[17], w[18], offset); w[22] = amd_bytealign (w[16], w[17], offset); w[21] = amd_bytealign (w[15], w[16], offset); w[20] = amd_bytealign (w[14], w[15], offset); w[19] = amd_bytealign (w[13], w[14], offset); w[18] = amd_bytealign (w[12], w[13], offset); w[17] = amd_bytealign (w[11], w[12], offset); w[16] = amd_bytealign (w[10], w[11], offset); w[15] = amd_bytealign (w[ 9], w[10], offset); w[14] = amd_bytealign (w[ 8], w[ 9], offset); w[13] = amd_bytealign (w[ 7], w[ 8], offset); w[12] = amd_bytealign (w[ 6], w[ 7], offset); w[11] = amd_bytealign (w[ 5], w[ 6], offset); w[10] = amd_bytealign (w[ 4], w[ 5], offset); w[ 9] = amd_bytealign (w[ 3], w[ 4], offset); w[ 8] = 
amd_bytealign (w[ 2], w[ 3], offset); w[ 7] = amd_bytealign (w[ 1], w[ 2], offset); w[ 6] = amd_bytealign (w[ 0], w[ 1], offset); w[ 5] = amd_bytealign ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = amd_bytealign (w[56], w[57], offset); w[62] = amd_bytealign (w[55], w[56], offset); w[61] = amd_bytealign (w[54], w[55], offset); w[60] = amd_bytealign (w[53], w[54], offset); w[59] = amd_bytealign (w[52], w[53], offset); w[58] = amd_bytealign (w[51], w[52], offset); w[57] = amd_bytealign (w[50], w[51], offset); w[56] = amd_bytealign (w[49], w[50], offset); w[55] = amd_bytealign (w[48], w[49], offset); w[54] = amd_bytealign (w[47], w[48], offset); w[53] = amd_bytealign (w[46], w[47], offset); w[52] = amd_bytealign (w[45], w[46], offset); w[51] = amd_bytealign (w[44], w[45], offset); w[50] = amd_bytealign (w[43], w[44], offset); w[49] = amd_bytealign (w[42], w[43], offset); w[48] = amd_bytealign (w[41], w[42], offset); w[47] = amd_bytealign (w[40], w[41], offset); w[46] = amd_bytealign (w[39], w[40], offset); w[45] = amd_bytealign (w[38], w[39], offset); w[44] = amd_bytealign (w[37], w[38], offset); w[43] = amd_bytealign (w[36], w[37], offset); w[42] = amd_bytealign (w[35], w[36], offset); w[41] = amd_bytealign (w[34], w[35], offset); w[40] = amd_bytealign (w[33], w[34], offset); w[39] = amd_bytealign (w[32], w[33], offset); w[38] = amd_bytealign (w[31], w[32], offset); w[37] = amd_bytealign (w[30], w[31], offset); w[36] = amd_bytealign (w[29], w[30], offset); w[35] = amd_bytealign (w[28], w[29], offset); w[34] = amd_bytealign (w[27], w[28], offset); w[33] = amd_bytealign (w[26], w[27], offset); w[32] = amd_bytealign (w[25], w[26], offset); w[31] = amd_bytealign (w[24], w[25], offset); w[30] = amd_bytealign (w[23], w[24], offset); w[29] = amd_bytealign (w[22], w[23], offset); w[28] = amd_bytealign (w[21], w[22], offset); w[27] = amd_bytealign (w[20], w[21], offset); w[26] = amd_bytealign (w[19], w[20], offset); w[25] = 
amd_bytealign (w[18], w[19], offset); w[24] = amd_bytealign (w[17], w[18], offset); w[23] = amd_bytealign (w[16], w[17], offset); w[22] = amd_bytealign (w[15], w[16], offset); w[21] = amd_bytealign (w[14], w[15], offset); w[20] = amd_bytealign (w[13], w[14], offset); w[19] = amd_bytealign (w[12], w[13], offset); w[18] = amd_bytealign (w[11], w[12], offset); w[17] = amd_bytealign (w[10], w[11], offset); w[16] = amd_bytealign (w[ 9], w[10], offset); w[15] = amd_bytealign (w[ 8], w[ 9], offset); w[14] = amd_bytealign (w[ 7], w[ 8], offset); w[13] = amd_bytealign (w[ 6], w[ 7], offset); w[12] = amd_bytealign (w[ 5], w[ 6], offset); w[11] = amd_bytealign (w[ 4], w[ 5], offset); w[10] = amd_bytealign (w[ 3], w[ 4], offset); w[ 9] = amd_bytealign (w[ 2], w[ 3], offset); w[ 8] = amd_bytealign (w[ 1], w[ 2], offset); w[ 7] = amd_bytealign (w[ 0], w[ 1], offset); w[ 6] = amd_bytealign ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = amd_bytealign (w[55], w[56], offset); w[62] = amd_bytealign (w[54], w[55], offset); w[61] = amd_bytealign (w[53], w[54], offset); w[60] = amd_bytealign (w[52], w[53], offset); w[59] = amd_bytealign (w[51], w[52], offset); w[58] = amd_bytealign (w[50], w[51], offset); w[57] = amd_bytealign (w[49], w[50], offset); w[56] = amd_bytealign (w[48], w[49], offset); w[55] = amd_bytealign (w[47], w[48], offset); w[54] = amd_bytealign (w[46], w[47], offset); w[53] = amd_bytealign (w[45], w[46], offset); w[52] = amd_bytealign (w[44], w[45], offset); w[51] = amd_bytealign (w[43], w[44], offset); w[50] = amd_bytealign (w[42], w[43], offset); w[49] = amd_bytealign (w[41], w[42], offset); w[48] = amd_bytealign (w[40], w[41], offset); w[47] = amd_bytealign (w[39], w[40], offset); w[46] = amd_bytealign (w[38], w[39], offset); w[45] = amd_bytealign (w[37], w[38], offset); w[44] = amd_bytealign (w[36], w[37], offset); w[43] = amd_bytealign (w[35], w[36], offset); w[42] = amd_bytealign (w[34], w[35], 
offset); w[41] = amd_bytealign (w[33], w[34], offset); w[40] = amd_bytealign (w[32], w[33], offset); w[39] = amd_bytealign (w[31], w[32], offset); w[38] = amd_bytealign (w[30], w[31], offset); w[37] = amd_bytealign (w[29], w[30], offset); w[36] = amd_bytealign (w[28], w[29], offset); w[35] = amd_bytealign (w[27], w[28], offset); w[34] = amd_bytealign (w[26], w[27], offset); w[33] = amd_bytealign (w[25], w[26], offset); w[32] = amd_bytealign (w[24], w[25], offset); w[31] = amd_bytealign (w[23], w[24], offset); w[30] = amd_bytealign (w[22], w[23], offset); w[29] = amd_bytealign (w[21], w[22], offset); w[28] = amd_bytealign (w[20], w[21], offset); w[27] = amd_bytealign (w[19], w[20], offset); w[26] = amd_bytealign (w[18], w[19], offset); w[25] = amd_bytealign (w[17], w[18], offset); w[24] = amd_bytealign (w[16], w[17], offset); w[23] = amd_bytealign (w[15], w[16], offset); w[22] = amd_bytealign (w[14], w[15], offset); w[21] = amd_bytealign (w[13], w[14], offset); w[20] = amd_bytealign (w[12], w[13], offset); w[19] = amd_bytealign (w[11], w[12], offset); w[18] = amd_bytealign (w[10], w[11], offset); w[17] = amd_bytealign (w[ 9], w[10], offset); w[16] = amd_bytealign (w[ 8], w[ 9], offset); w[15] = amd_bytealign (w[ 7], w[ 8], offset); w[14] = amd_bytealign (w[ 6], w[ 7], offset); w[13] = amd_bytealign (w[ 5], w[ 6], offset); w[12] = amd_bytealign (w[ 4], w[ 5], offset); w[11] = amd_bytealign (w[ 3], w[ 4], offset); w[10] = amd_bytealign (w[ 2], w[ 3], offset); w[ 9] = amd_bytealign (w[ 1], w[ 2], offset); w[ 8] = amd_bytealign (w[ 0], w[ 1], offset); w[ 7] = amd_bytealign ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = amd_bytealign (w[54], w[55], offset); w[62] = amd_bytealign (w[53], w[54], offset); w[61] = amd_bytealign (w[52], w[53], offset); w[60] = amd_bytealign (w[51], w[52], offset); w[59] = amd_bytealign (w[50], w[51], offset); w[58] = amd_bytealign (w[49], w[50], offset); w[57] = 
amd_bytealign (w[48], w[49], offset); w[56] = amd_bytealign (w[47], w[48], offset); w[55] = amd_bytealign (w[46], w[47], offset); w[54] = amd_bytealign (w[45], w[46], offset); w[53] = amd_bytealign (w[44], w[45], offset); w[52] = amd_bytealign (w[43], w[44], offset); w[51] = amd_bytealign (w[42], w[43], offset); w[50] = amd_bytealign (w[41], w[42], offset); w[49] = amd_bytealign (w[40], w[41], offset); w[48] = amd_bytealign (w[39], w[40], offset); w[47] = amd_bytealign (w[38], w[39], offset); w[46] = amd_bytealign (w[37], w[38], offset); w[45] = amd_bytealign (w[36], w[37], offset); w[44] = amd_bytealign (w[35], w[36], offset); w[43] = amd_bytealign (w[34], w[35], offset); w[42] = amd_bytealign (w[33], w[34], offset); w[41] = amd_bytealign (w[32], w[33], offset); w[40] = amd_bytealign (w[31], w[32], offset); w[39] = amd_bytealign (w[30], w[31], offset); w[38] = amd_bytealign (w[29], w[30], offset); w[37] = amd_bytealign (w[28], w[29], offset); w[36] = amd_bytealign (w[27], w[28], offset); w[35] = amd_bytealign (w[26], w[27], offset); w[34] = amd_bytealign (w[25], w[26], offset); w[33] = amd_bytealign (w[24], w[25], offset); w[32] = amd_bytealign (w[23], w[24], offset); w[31] = amd_bytealign (w[22], w[23], offset); w[30] = amd_bytealign (w[21], w[22], offset); w[29] = amd_bytealign (w[20], w[21], offset); w[28] = amd_bytealign (w[19], w[20], offset); w[27] = amd_bytealign (w[18], w[19], offset); w[26] = amd_bytealign (w[17], w[18], offset); w[25] = amd_bytealign (w[16], w[17], offset); w[24] = amd_bytealign (w[15], w[16], offset); w[23] = amd_bytealign (w[14], w[15], offset); w[22] = amd_bytealign (w[13], w[14], offset); w[21] = amd_bytealign (w[12], w[13], offset); w[20] = amd_bytealign (w[11], w[12], offset); w[19] = amd_bytealign (w[10], w[11], offset); w[18] = amd_bytealign (w[ 9], w[10], offset); w[17] = amd_bytealign (w[ 8], w[ 9], offset); w[16] = amd_bytealign (w[ 7], w[ 8], offset); w[15] = amd_bytealign (w[ 6], w[ 7], offset); w[14] = amd_bytealign (w[ 5], 
w[ 6], offset); w[13] = amd_bytealign (w[ 4], w[ 5], offset); w[12] = amd_bytealign (w[ 3], w[ 4], offset); w[11] = amd_bytealign (w[ 2], w[ 3], offset); w[10] = amd_bytealign (w[ 1], w[ 2], offset); w[ 9] = amd_bytealign (w[ 0], w[ 1], offset); w[ 8] = amd_bytealign ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = amd_bytealign (w[53], w[54], offset); w[62] = amd_bytealign (w[52], w[53], offset); w[61] = amd_bytealign (w[51], w[52], offset); w[60] = amd_bytealign (w[50], w[51], offset); w[59] = amd_bytealign (w[49], w[50], offset); w[58] = amd_bytealign (w[48], w[49], offset); w[57] = amd_bytealign (w[47], w[48], offset); w[56] = amd_bytealign (w[46], w[47], offset); w[55] = amd_bytealign (w[45], w[46], offset); w[54] = amd_bytealign (w[44], w[45], offset); w[53] = amd_bytealign (w[43], w[44], offset); w[52] = amd_bytealign (w[42], w[43], offset); w[51] = amd_bytealign (w[41], w[42], offset); w[50] = amd_bytealign (w[40], w[41], offset); w[49] = amd_bytealign (w[39], w[40], offset); w[48] = amd_bytealign (w[38], w[39], offset); w[47] = amd_bytealign (w[37], w[38], offset); w[46] = amd_bytealign (w[36], w[37], offset); w[45] = amd_bytealign (w[35], w[36], offset); w[44] = amd_bytealign (w[34], w[35], offset); w[43] = amd_bytealign (w[33], w[34], offset); w[42] = amd_bytealign (w[32], w[33], offset); w[41] = amd_bytealign (w[31], w[32], offset); w[40] = amd_bytealign (w[30], w[31], offset); w[39] = amd_bytealign (w[29], w[30], offset); w[38] = amd_bytealign (w[28], w[29], offset); w[37] = amd_bytealign (w[27], w[28], offset); w[36] = amd_bytealign (w[26], w[27], offset); w[35] = amd_bytealign (w[25], w[26], offset); w[34] = amd_bytealign (w[24], w[25], offset); w[33] = amd_bytealign (w[23], w[24], offset); w[32] = amd_bytealign (w[22], w[23], offset); w[31] = amd_bytealign (w[21], w[22], offset); w[30] = amd_bytealign (w[20], w[21], offset); w[29] = amd_bytealign (w[19], w[20], 
offset); w[28] = amd_bytealign (w[18], w[19], offset); w[27] = amd_bytealign (w[17], w[18], offset); w[26] = amd_bytealign (w[16], w[17], offset); w[25] = amd_bytealign (w[15], w[16], offset); w[24] = amd_bytealign (w[14], w[15], offset); w[23] = amd_bytealign (w[13], w[14], offset); w[22] = amd_bytealign (w[12], w[13], offset); w[21] = amd_bytealign (w[11], w[12], offset); w[20] = amd_bytealign (w[10], w[11], offset); w[19] = amd_bytealign (w[ 9], w[10], offset); w[18] = amd_bytealign (w[ 8], w[ 9], offset); w[17] = amd_bytealign (w[ 7], w[ 8], offset); w[16] = amd_bytealign (w[ 6], w[ 7], offset); w[15] = amd_bytealign (w[ 5], w[ 6], offset); w[14] = amd_bytealign (w[ 4], w[ 5], offset); w[13] = amd_bytealign (w[ 3], w[ 4], offset); w[12] = amd_bytealign (w[ 2], w[ 3], offset); w[11] = amd_bytealign (w[ 1], w[ 2], offset); w[10] = amd_bytealign (w[ 0], w[ 1], offset); w[ 9] = amd_bytealign ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = amd_bytealign (w[52], w[53], offset); w[62] = amd_bytealign (w[51], w[52], offset); w[61] = amd_bytealign (w[50], w[51], offset); w[60] = amd_bytealign (w[49], w[50], offset); w[59] = amd_bytealign (w[48], w[49], offset); w[58] = amd_bytealign (w[47], w[48], offset); w[57] = amd_bytealign (w[46], w[47], offset); w[56] = amd_bytealign (w[45], w[46], offset); w[55] = amd_bytealign (w[44], w[45], offset); w[54] = amd_bytealign (w[43], w[44], offset); w[53] = amd_bytealign (w[42], w[43], offset); w[52] = amd_bytealign (w[41], w[42], offset); w[51] = amd_bytealign (w[40], w[41], offset); w[50] = amd_bytealign (w[39], w[40], offset); w[49] = amd_bytealign (w[38], w[39], offset); w[48] = amd_bytealign (w[37], w[38], offset); w[47] = amd_bytealign (w[36], w[37], offset); w[46] = amd_bytealign (w[35], w[36], offset); w[45] = amd_bytealign (w[34], w[35], offset); w[44] = amd_bytealign (w[33], w[34], offset); w[43] = amd_bytealign (w[32], w[33], 
offset); w[42] = amd_bytealign (w[31], w[32], offset); w[41] = amd_bytealign (w[30], w[31], offset); w[40] = amd_bytealign (w[29], w[30], offset); w[39] = amd_bytealign (w[28], w[29], offset); w[38] = amd_bytealign (w[27], w[28], offset); w[37] = amd_bytealign (w[26], w[27], offset); w[36] = amd_bytealign (w[25], w[26], offset); w[35] = amd_bytealign (w[24], w[25], offset); w[34] = amd_bytealign (w[23], w[24], offset); w[33] = amd_bytealign (w[22], w[23], offset); w[32] = amd_bytealign (w[21], w[22], offset); w[31] = amd_bytealign (w[20], w[21], offset); w[30] = amd_bytealign (w[19], w[20], offset); w[29] = amd_bytealign (w[18], w[19], offset); w[28] = amd_bytealign (w[17], w[18], offset); w[27] = amd_bytealign (w[16], w[17], offset); w[26] = amd_bytealign (w[15], w[16], offset); w[25] = amd_bytealign (w[14], w[15], offset); w[24] = amd_bytealign (w[13], w[14], offset); w[23] = amd_bytealign (w[12], w[13], offset); w[22] = amd_bytealign (w[11], w[12], offset); w[21] = amd_bytealign (w[10], w[11], offset); w[20] = amd_bytealign (w[ 9], w[10], offset); w[19] = amd_bytealign (w[ 8], w[ 9], offset); w[18] = amd_bytealign (w[ 7], w[ 8], offset); w[17] = amd_bytealign (w[ 6], w[ 7], offset); w[16] = amd_bytealign (w[ 5], w[ 6], offset); w[15] = amd_bytealign (w[ 4], w[ 5], offset); w[14] = amd_bytealign (w[ 3], w[ 4], offset); w[13] = amd_bytealign (w[ 2], w[ 3], offset); w[12] = amd_bytealign (w[ 1], w[ 2], offset); w[11] = amd_bytealign (w[ 0], w[ 1], offset); w[10] = amd_bytealign ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = amd_bytealign (w[51], w[52], offset); w[62] = amd_bytealign (w[50], w[51], offset); w[61] = amd_bytealign (w[49], w[50], offset); w[60] = amd_bytealign (w[48], w[49], offset); w[59] = amd_bytealign (w[47], w[48], offset); w[58] = amd_bytealign (w[46], w[47], offset); w[57] = amd_bytealign (w[45], w[46], offset); w[56] = amd_bytealign 
(w[44], w[45], offset); w[55] = amd_bytealign (w[43], w[44], offset); w[54] = amd_bytealign (w[42], w[43], offset); w[53] = amd_bytealign (w[41], w[42], offset); w[52] = amd_bytealign (w[40], w[41], offset); w[51] = amd_bytealign (w[39], w[40], offset); w[50] = amd_bytealign (w[38], w[39], offset); w[49] = amd_bytealign (w[37], w[38], offset); w[48] = amd_bytealign (w[36], w[37], offset); w[47] = amd_bytealign (w[35], w[36], offset); w[46] = amd_bytealign (w[34], w[35], offset); w[45] = amd_bytealign (w[33], w[34], offset); w[44] = amd_bytealign (w[32], w[33], offset); w[43] = amd_bytealign (w[31], w[32], offset); w[42] = amd_bytealign (w[30], w[31], offset); w[41] = amd_bytealign (w[29], w[30], offset); w[40] = amd_bytealign (w[28], w[29], offset); w[39] = amd_bytealign (w[27], w[28], offset); w[38] = amd_bytealign (w[26], w[27], offset); w[37] = amd_bytealign (w[25], w[26], offset); w[36] = amd_bytealign (w[24], w[25], offset); w[35] = amd_bytealign (w[23], w[24], offset); w[34] = amd_bytealign (w[22], w[23], offset); w[33] = amd_bytealign (w[21], w[22], offset); w[32] = amd_bytealign (w[20], w[21], offset); w[31] = amd_bytealign (w[19], w[20], offset); w[30] = amd_bytealign (w[18], w[19], offset); w[29] = amd_bytealign (w[17], w[18], offset); w[28] = amd_bytealign (w[16], w[17], offset); w[27] = amd_bytealign (w[15], w[16], offset); w[26] = amd_bytealign (w[14], w[15], offset); w[25] = amd_bytealign (w[13], w[14], offset); w[24] = amd_bytealign (w[12], w[13], offset); w[23] = amd_bytealign (w[11], w[12], offset); w[22] = amd_bytealign (w[10], w[11], offset); w[21] = amd_bytealign (w[ 9], w[10], offset); w[20] = amd_bytealign (w[ 8], w[ 9], offset); w[19] = amd_bytealign (w[ 7], w[ 8], offset); w[18] = amd_bytealign (w[ 6], w[ 7], offset); w[17] = amd_bytealign (w[ 5], w[ 6], offset); w[16] = amd_bytealign (w[ 4], w[ 5], offset); w[15] = amd_bytealign (w[ 3], w[ 4], offset); w[14] = amd_bytealign (w[ 2], w[ 3], offset); w[13] = amd_bytealign (w[ 1], w[ 2], 
offset); w[12] = amd_bytealign (w[ 0], w[ 1], offset); w[11] = amd_bytealign ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = amd_bytealign (w[50], w[51], offset); w[62] = amd_bytealign (w[49], w[50], offset); w[61] = amd_bytealign (w[48], w[49], offset); w[60] = amd_bytealign (w[47], w[48], offset); w[59] = amd_bytealign (w[46], w[47], offset); w[58] = amd_bytealign (w[45], w[46], offset); w[57] = amd_bytealign (w[44], w[45], offset); w[56] = amd_bytealign (w[43], w[44], offset); w[55] = amd_bytealign (w[42], w[43], offset); w[54] = amd_bytealign (w[41], w[42], offset); w[53] = amd_bytealign (w[40], w[41], offset); w[52] = amd_bytealign (w[39], w[40], offset); w[51] = amd_bytealign (w[38], w[39], offset); w[50] = amd_bytealign (w[37], w[38], offset); w[49] = amd_bytealign (w[36], w[37], offset); w[48] = amd_bytealign (w[35], w[36], offset); w[47] = amd_bytealign (w[34], w[35], offset); w[46] = amd_bytealign (w[33], w[34], offset); w[45] = amd_bytealign (w[32], w[33], offset); w[44] = amd_bytealign (w[31], w[32], offset); w[43] = amd_bytealign (w[30], w[31], offset); w[42] = amd_bytealign (w[29], w[30], offset); w[41] = amd_bytealign (w[28], w[29], offset); w[40] = amd_bytealign (w[27], w[28], offset); w[39] = amd_bytealign (w[26], w[27], offset); w[38] = amd_bytealign (w[25], w[26], offset); w[37] = amd_bytealign (w[24], w[25], offset); w[36] = amd_bytealign (w[23], w[24], offset); w[35] = amd_bytealign (w[22], w[23], offset); w[34] = amd_bytealign (w[21], w[22], offset); w[33] = amd_bytealign (w[20], w[21], offset); w[32] = amd_bytealign (w[19], w[20], offset); w[31] = amd_bytealign (w[18], w[19], offset); w[30] = amd_bytealign (w[17], w[18], offset); w[29] = amd_bytealign (w[16], w[17], offset); w[28] = amd_bytealign (w[15], w[16], offset); w[27] = amd_bytealign (w[14], w[15], offset); w[26] = amd_bytealign (w[13], w[14], offset); w[25] = 
amd_bytealign (w[12], w[13], offset); w[24] = amd_bytealign (w[11], w[12], offset); w[23] = amd_bytealign (w[10], w[11], offset); w[22] = amd_bytealign (w[ 9], w[10], offset); w[21] = amd_bytealign (w[ 8], w[ 9], offset); w[20] = amd_bytealign (w[ 7], w[ 8], offset); w[19] = amd_bytealign (w[ 6], w[ 7], offset); w[18] = amd_bytealign (w[ 5], w[ 6], offset); w[17] = amd_bytealign (w[ 4], w[ 5], offset); w[16] = amd_bytealign (w[ 3], w[ 4], offset); w[15] = amd_bytealign (w[ 2], w[ 3], offset); w[14] = amd_bytealign (w[ 1], w[ 2], offset); w[13] = amd_bytealign (w[ 0], w[ 1], offset); w[12] = amd_bytealign ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = amd_bytealign (w[49], w[50], offset); w[62] = amd_bytealign (w[48], w[49], offset); w[61] = amd_bytealign (w[47], w[48], offset); w[60] = amd_bytealign (w[46], w[47], offset); w[59] = amd_bytealign (w[45], w[46], offset); w[58] = amd_bytealign (w[44], w[45], offset); w[57] = amd_bytealign (w[43], w[44], offset); w[56] = amd_bytealign (w[42], w[43], offset); w[55] = amd_bytealign (w[41], w[42], offset); w[54] = amd_bytealign (w[40], w[41], offset); w[53] = amd_bytealign (w[39], w[40], offset); w[52] = amd_bytealign (w[38], w[39], offset); w[51] = amd_bytealign (w[37], w[38], offset); w[50] = amd_bytealign (w[36], w[37], offset); w[49] = amd_bytealign (w[35], w[36], offset); w[48] = amd_bytealign (w[34], w[35], offset); w[47] = amd_bytealign (w[33], w[34], offset); w[46] = amd_bytealign (w[32], w[33], offset); w[45] = amd_bytealign (w[31], w[32], offset); w[44] = amd_bytealign (w[30], w[31], offset); w[43] = amd_bytealign (w[29], w[30], offset); w[42] = amd_bytealign (w[28], w[29], offset); w[41] = amd_bytealign (w[27], w[28], offset); w[40] = amd_bytealign (w[26], w[27], offset); w[39] = amd_bytealign (w[25], w[26], offset); w[38] = amd_bytealign (w[24], w[25], offset); w[37] = amd_bytealign 
(w[23], w[24], offset); w[36] = amd_bytealign (w[22], w[23], offset); w[35] = amd_bytealign (w[21], w[22], offset); w[34] = amd_bytealign (w[20], w[21], offset); w[33] = amd_bytealign (w[19], w[20], offset); w[32] = amd_bytealign (w[18], w[19], offset); w[31] = amd_bytealign (w[17], w[18], offset); w[30] = amd_bytealign (w[16], w[17], offset); w[29] = amd_bytealign (w[15], w[16], offset); w[28] = amd_bytealign (w[14], w[15], offset); w[27] = amd_bytealign (w[13], w[14], offset); w[26] = amd_bytealign (w[12], w[13], offset); w[25] = amd_bytealign (w[11], w[12], offset); w[24] = amd_bytealign (w[10], w[11], offset); w[23] = amd_bytealign (w[ 9], w[10], offset); w[22] = amd_bytealign (w[ 8], w[ 9], offset); w[21] = amd_bytealign (w[ 7], w[ 8], offset); w[20] = amd_bytealign (w[ 6], w[ 7], offset); w[19] = amd_bytealign (w[ 5], w[ 6], offset); w[18] = amd_bytealign (w[ 4], w[ 5], offset); w[17] = amd_bytealign (w[ 3], w[ 4], offset); w[16] = amd_bytealign (w[ 2], w[ 3], offset); w[15] = amd_bytealign (w[ 1], w[ 2], offset); w[14] = amd_bytealign (w[ 0], w[ 1], offset); w[13] = amd_bytealign ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 14: w[63] = amd_bytealign (w[48], w[49], offset); w[62] = amd_bytealign (w[47], w[48], offset); w[61] = amd_bytealign (w[46], w[47], offset); w[60] = amd_bytealign (w[45], w[46], offset); w[59] = amd_bytealign (w[44], w[45], offset); w[58] = amd_bytealign (w[43], w[44], offset); w[57] = amd_bytealign (w[42], w[43], offset); w[56] = amd_bytealign (w[41], w[42], offset); w[55] = amd_bytealign (w[40], w[41], offset); w[54] = amd_bytealign (w[39], w[40], offset); w[53] = amd_bytealign (w[38], w[39], offset); w[52] = amd_bytealign (w[37], w[38], offset); w[51] = amd_bytealign (w[36], w[37], offset); w[50] = amd_bytealign (w[35], w[36], offset); w[49] = amd_bytealign (w[34], w[35], offset); w[48] = amd_bytealign 
(w[33], w[34], offset); w[47] = amd_bytealign (w[32], w[33], offset); w[46] = amd_bytealign (w[31], w[32], offset); w[45] = amd_bytealign (w[30], w[31], offset); w[44] = amd_bytealign (w[29], w[30], offset); w[43] = amd_bytealign (w[28], w[29], offset); w[42] = amd_bytealign (w[27], w[28], offset); w[41] = amd_bytealign (w[26], w[27], offset); w[40] = amd_bytealign (w[25], w[26], offset); w[39] = amd_bytealign (w[24], w[25], offset); w[38] = amd_bytealign (w[23], w[24], offset); w[37] = amd_bytealign (w[22], w[23], offset); w[36] = amd_bytealign (w[21], w[22], offset); w[35] = amd_bytealign (w[20], w[21], offset); w[34] = amd_bytealign (w[19], w[20], offset); w[33] = amd_bytealign (w[18], w[19], offset); w[32] = amd_bytealign (w[17], w[18], offset); w[31] = amd_bytealign (w[16], w[17], offset); w[30] = amd_bytealign (w[15], w[16], offset); w[29] = amd_bytealign (w[14], w[15], offset); w[28] = amd_bytealign (w[13], w[14], offset); w[27] = amd_bytealign (w[12], w[13], offset); w[26] = amd_bytealign (w[11], w[12], offset); w[25] = amd_bytealign (w[10], w[11], offset); w[24] = amd_bytealign (w[ 9], w[10], offset); w[23] = amd_bytealign (w[ 8], w[ 9], offset); w[22] = amd_bytealign (w[ 7], w[ 8], offset); w[21] = amd_bytealign (w[ 6], w[ 7], offset); w[20] = amd_bytealign (w[ 5], w[ 6], offset); w[19] = amd_bytealign (w[ 4], w[ 5], offset); w[18] = amd_bytealign (w[ 3], w[ 4], offset); w[17] = amd_bytealign (w[ 2], w[ 3], offset); w[16] = amd_bytealign (w[ 1], w[ 2], offset); w[15] = amd_bytealign (w[ 0], w[ 1], offset); w[14] = amd_bytealign ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = amd_bytealign (w[47], w[48], offset); w[62] = amd_bytealign (w[46], w[47], offset); w[61] = amd_bytealign (w[45], w[46], offset); w[60] = amd_bytealign (w[44], w[45], offset); w[59] = amd_bytealign (w[43], w[44], offset); w[58] = 
amd_bytealign (w[42], w[43], offset); w[57] = amd_bytealign (w[41], w[42], offset); w[56] = amd_bytealign (w[40], w[41], offset); w[55] = amd_bytealign (w[39], w[40], offset); w[54] = amd_bytealign (w[38], w[39], offset); w[53] = amd_bytealign (w[37], w[38], offset); w[52] = amd_bytealign (w[36], w[37], offset); w[51] = amd_bytealign (w[35], w[36], offset); w[50] = amd_bytealign (w[34], w[35], offset); w[49] = amd_bytealign (w[33], w[34], offset); w[48] = amd_bytealign (w[32], w[33], offset); w[47] = amd_bytealign (w[31], w[32], offset); w[46] = amd_bytealign (w[30], w[31], offset); w[45] = amd_bytealign (w[29], w[30], offset); w[44] = amd_bytealign (w[28], w[29], offset); w[43] = amd_bytealign (w[27], w[28], offset); w[42] = amd_bytealign (w[26], w[27], offset); w[41] = amd_bytealign (w[25], w[26], offset); w[40] = amd_bytealign (w[24], w[25], offset); w[39] = amd_bytealign (w[23], w[24], offset); w[38] = amd_bytealign (w[22], w[23], offset); w[37] = amd_bytealign (w[21], w[22], offset); w[36] = amd_bytealign (w[20], w[21], offset); w[35] = amd_bytealign (w[19], w[20], offset); w[34] = amd_bytealign (w[18], w[19], offset); w[33] = amd_bytealign (w[17], w[18], offset); w[32] = amd_bytealign (w[16], w[17], offset); w[31] = amd_bytealign (w[15], w[16], offset); w[30] = amd_bytealign (w[14], w[15], offset); w[29] = amd_bytealign (w[13], w[14], offset); w[28] = amd_bytealign (w[12], w[13], offset); w[27] = amd_bytealign (w[11], w[12], offset); w[26] = amd_bytealign (w[10], w[11], offset); w[25] = amd_bytealign (w[ 9], w[10], offset); w[24] = amd_bytealign (w[ 8], w[ 9], offset); w[23] = amd_bytealign (w[ 7], w[ 8], offset); w[22] = amd_bytealign (w[ 6], w[ 7], offset); w[21] = amd_bytealign (w[ 5], w[ 6], offset); w[20] = amd_bytealign (w[ 4], w[ 5], offset); w[19] = amd_bytealign (w[ 3], w[ 4], offset); w[18] = amd_bytealign (w[ 2], w[ 3], offset); w[17] = amd_bytealign (w[ 1], w[ 2], offset); w[16] = amd_bytealign (w[ 0], w[ 1], offset); w[15] = amd_bytealign ( 0, w[ 
0], offset); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = amd_bytealign (w[46], w[47], offset); w[62] = amd_bytealign (w[45], w[46], offset); w[61] = amd_bytealign (w[44], w[45], offset); w[60] = amd_bytealign (w[43], w[44], offset); w[59] = amd_bytealign (w[42], w[43], offset); w[58] = amd_bytealign (w[41], w[42], offset); w[57] = amd_bytealign (w[40], w[41], offset); w[56] = amd_bytealign (w[39], w[40], offset); w[55] = amd_bytealign (w[38], w[39], offset); w[54] = amd_bytealign (w[37], w[38], offset); w[53] = amd_bytealign (w[36], w[37], offset); w[52] = amd_bytealign (w[35], w[36], offset); w[51] = amd_bytealign (w[34], w[35], offset); w[50] = amd_bytealign (w[33], w[34], offset); w[49] = amd_bytealign (w[32], w[33], offset); w[48] = amd_bytealign (w[31], w[32], offset); w[47] = amd_bytealign (w[30], w[31], offset); w[46] = amd_bytealign (w[29], w[30], offset); w[45] = amd_bytealign (w[28], w[29], offset); w[44] = amd_bytealign (w[27], w[28], offset); w[43] = amd_bytealign (w[26], w[27], offset); w[42] = amd_bytealign (w[25], w[26], offset); w[41] = amd_bytealign (w[24], w[25], offset); w[40] = amd_bytealign (w[23], w[24], offset); w[39] = amd_bytealign (w[22], w[23], offset); w[38] = amd_bytealign (w[21], w[22], offset); w[37] = amd_bytealign (w[20], w[21], offset); w[36] = amd_bytealign (w[19], w[20], offset); w[35] = amd_bytealign (w[18], w[19], offset); w[34] = amd_bytealign (w[17], w[18], offset); w[33] = amd_bytealign (w[16], w[17], offset); w[32] = amd_bytealign (w[15], w[16], offset); w[31] = amd_bytealign (w[14], w[15], offset); w[30] = amd_bytealign (w[13], w[14], offset); w[29] = amd_bytealign (w[12], w[13], offset); w[28] = amd_bytealign (w[11], w[12], offset); w[27] = amd_bytealign (w[10], w[11], offset); w[26] = amd_bytealign (w[ 9], w[10], offset); w[25] = amd_bytealign (w[ 8], w[ 9], offset); w[24] = 
amd_bytealign (w[ 7], w[ 8], offset); w[23] = amd_bytealign (w[ 6], w[ 7], offset); w[22] = amd_bytealign (w[ 5], w[ 6], offset); w[21] = amd_bytealign (w[ 4], w[ 5], offset); w[20] = amd_bytealign (w[ 3], w[ 4], offset); w[19] = amd_bytealign (w[ 2], w[ 3], offset); w[18] = amd_bytealign (w[ 1], w[ 2], offset); w[17] = amd_bytealign (w[ 0], w[ 1], offset); w[16] = amd_bytealign ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = amd_bytealign (w[45], w[46], offset); w[62] = amd_bytealign (w[44], w[45], offset); w[61] = amd_bytealign (w[43], w[44], offset); w[60] = amd_bytealign (w[42], w[43], offset); w[59] = amd_bytealign (w[41], w[42], offset); w[58] = amd_bytealign (w[40], w[41], offset); w[57] = amd_bytealign (w[39], w[40], offset); w[56] = amd_bytealign (w[38], w[39], offset); w[55] = amd_bytealign (w[37], w[38], offset); w[54] = amd_bytealign (w[36], w[37], offset); w[53] = amd_bytealign (w[35], w[36], offset); w[52] = amd_bytealign (w[34], w[35], offset); w[51] = amd_bytealign (w[33], w[34], offset); w[50] = amd_bytealign (w[32], w[33], offset); w[49] = amd_bytealign (w[31], w[32], offset); w[48] = amd_bytealign (w[30], w[31], offset); w[47] = amd_bytealign (w[29], w[30], offset); w[46] = amd_bytealign (w[28], w[29], offset); w[45] = amd_bytealign (w[27], w[28], offset); w[44] = amd_bytealign (w[26], w[27], offset); w[43] = amd_bytealign (w[25], w[26], offset); w[42] = amd_bytealign (w[24], w[25], offset); w[41] = amd_bytealign (w[23], w[24], offset); w[40] = amd_bytealign (w[22], w[23], offset); w[39] = amd_bytealign (w[21], w[22], offset); w[38] = amd_bytealign (w[20], w[21], offset); w[37] = amd_bytealign (w[19], w[20], offset); w[36] = amd_bytealign (w[18], w[19], offset); w[35] = amd_bytealign (w[17], w[18], offset); w[34] = amd_bytealign (w[16], w[17], offset); w[33] = amd_bytealign 
(w[15], w[16], offset); w[32] = amd_bytealign (w[14], w[15], offset); w[31] = amd_bytealign (w[13], w[14], offset); w[30] = amd_bytealign (w[12], w[13], offset); w[29] = amd_bytealign (w[11], w[12], offset); w[28] = amd_bytealign (w[10], w[11], offset); w[27] = amd_bytealign (w[ 9], w[10], offset); w[26] = amd_bytealign (w[ 8], w[ 9], offset); w[25] = amd_bytealign (w[ 7], w[ 8], offset); w[24] = amd_bytealign (w[ 6], w[ 7], offset); w[23] = amd_bytealign (w[ 5], w[ 6], offset); w[22] = amd_bytealign (w[ 4], w[ 5], offset); w[21] = amd_bytealign (w[ 3], w[ 4], offset); w[20] = amd_bytealign (w[ 2], w[ 3], offset); w[19] = amd_bytealign (w[ 1], w[ 2], offset); w[18] = amd_bytealign (w[ 0], w[ 1], offset); w[17] = amd_bytealign ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = amd_bytealign (w[44], w[45], offset); w[62] = amd_bytealign (w[43], w[44], offset); w[61] = amd_bytealign (w[42], w[43], offset); w[60] = amd_bytealign (w[41], w[42], offset); w[59] = amd_bytealign (w[40], w[41], offset); w[58] = amd_bytealign (w[39], w[40], offset); w[57] = amd_bytealign (w[38], w[39], offset); w[56] = amd_bytealign (w[37], w[38], offset); w[55] = amd_bytealign (w[36], w[37], offset); w[54] = amd_bytealign (w[35], w[36], offset); w[53] = amd_bytealign (w[34], w[35], offset); w[52] = amd_bytealign (w[33], w[34], offset); w[51] = amd_bytealign (w[32], w[33], offset); w[50] = amd_bytealign (w[31], w[32], offset); w[49] = amd_bytealign (w[30], w[31], offset); w[48] = amd_bytealign (w[29], w[30], offset); w[47] = amd_bytealign (w[28], w[29], offset); w[46] = amd_bytealign (w[27], w[28], offset); w[45] = amd_bytealign (w[26], w[27], offset); w[44] = amd_bytealign (w[25], w[26], offset); w[43] = amd_bytealign (w[24], w[25], offset); w[42] = amd_bytealign (w[23], w[24], offset); w[41] = amd_bytealign 
(w[22], w[23], offset); w[40] = amd_bytealign (w[21], w[22], offset); w[39] = amd_bytealign (w[20], w[21], offset); w[38] = amd_bytealign (w[19], w[20], offset); w[37] = amd_bytealign (w[18], w[19], offset); w[36] = amd_bytealign (w[17], w[18], offset); w[35] = amd_bytealign (w[16], w[17], offset); w[34] = amd_bytealign (w[15], w[16], offset); w[33] = amd_bytealign (w[14], w[15], offset); w[32] = amd_bytealign (w[13], w[14], offset); w[31] = amd_bytealign (w[12], w[13], offset); w[30] = amd_bytealign (w[11], w[12], offset); w[29] = amd_bytealign (w[10], w[11], offset); w[28] = amd_bytealign (w[ 9], w[10], offset); w[27] = amd_bytealign (w[ 8], w[ 9], offset); w[26] = amd_bytealign (w[ 7], w[ 8], offset); w[25] = amd_bytealign (w[ 6], w[ 7], offset); w[24] = amd_bytealign (w[ 5], w[ 6], offset); w[23] = amd_bytealign (w[ 4], w[ 5], offset); w[22] = amd_bytealign (w[ 3], w[ 4], offset); w[21] = amd_bytealign (w[ 2], w[ 3], offset); w[20] = amd_bytealign (w[ 1], w[ 2], offset); w[19] = amd_bytealign (w[ 0], w[ 1], offset); w[18] = amd_bytealign ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = amd_bytealign (w[43], w[44], offset); w[62] = amd_bytealign (w[42], w[43], offset); w[61] = amd_bytealign (w[41], w[42], offset); w[60] = amd_bytealign (w[40], w[41], offset); w[59] = amd_bytealign (w[39], w[40], offset); w[58] = amd_bytealign (w[38], w[39], offset); w[57] = amd_bytealign (w[37], w[38], offset); w[56] = amd_bytealign (w[36], w[37], offset); w[55] = amd_bytealign (w[35], w[36], offset); w[54] = amd_bytealign (w[34], w[35], offset); w[53] = amd_bytealign (w[33], w[34], offset); w[52] = amd_bytealign (w[32], w[33], offset); w[51] = amd_bytealign (w[31], w[32], offset); w[50] = amd_bytealign (w[30], w[31], offset); w[49] = amd_bytealign (w[29], w[30], offset); w[48] = 
amd_bytealign (w[28], w[29], offset); w[47] = amd_bytealign (w[27], w[28], offset); w[46] = amd_bytealign (w[26], w[27], offset); w[45] = amd_bytealign (w[25], w[26], offset); w[44] = amd_bytealign (w[24], w[25], offset); w[43] = amd_bytealign (w[23], w[24], offset); w[42] = amd_bytealign (w[22], w[23], offset); w[41] = amd_bytealign (w[21], w[22], offset); w[40] = amd_bytealign (w[20], w[21], offset); w[39] = amd_bytealign (w[19], w[20], offset); w[38] = amd_bytealign (w[18], w[19], offset); w[37] = amd_bytealign (w[17], w[18], offset); w[36] = amd_bytealign (w[16], w[17], offset); w[35] = amd_bytealign (w[15], w[16], offset); w[34] = amd_bytealign (w[14], w[15], offset); w[33] = amd_bytealign (w[13], w[14], offset); w[32] = amd_bytealign (w[12], w[13], offset); w[31] = amd_bytealign (w[11], w[12], offset); w[30] = amd_bytealign (w[10], w[11], offset); w[29] = amd_bytealign (w[ 9], w[10], offset); w[28] = amd_bytealign (w[ 8], w[ 9], offset); w[27] = amd_bytealign (w[ 7], w[ 8], offset); w[26] = amd_bytealign (w[ 6], w[ 7], offset); w[25] = amd_bytealign (w[ 5], w[ 6], offset); w[24] = amd_bytealign (w[ 4], w[ 5], offset); w[23] = amd_bytealign (w[ 3], w[ 4], offset); w[22] = amd_bytealign (w[ 2], w[ 3], offset); w[21] = amd_bytealign (w[ 1], w[ 2], offset); w[20] = amd_bytealign (w[ 0], w[ 1], offset); w[19] = amd_bytealign ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = amd_bytealign (w[42], w[43], offset); w[62] = amd_bytealign (w[41], w[42], offset); w[61] = amd_bytealign (w[40], w[41], offset); w[60] = amd_bytealign (w[39], w[40], offset); w[59] = amd_bytealign (w[38], w[39], offset); w[58] = amd_bytealign (w[37], w[38], offset); w[57] = amd_bytealign (w[36], w[37], offset); w[56] = amd_bytealign (w[35], w[36], offset); w[55] = amd_bytealign (w[34], w[35], 
offset); w[54] = amd_bytealign (w[33], w[34], offset); w[53] = amd_bytealign (w[32], w[33], offset); w[52] = amd_bytealign (w[31], w[32], offset); w[51] = amd_bytealign (w[30], w[31], offset); w[50] = amd_bytealign (w[29], w[30], offset); w[49] = amd_bytealign (w[28], w[29], offset); w[48] = amd_bytealign (w[27], w[28], offset); w[47] = amd_bytealign (w[26], w[27], offset); w[46] = amd_bytealign (w[25], w[26], offset); w[45] = amd_bytealign (w[24], w[25], offset); w[44] = amd_bytealign (w[23], w[24], offset); w[43] = amd_bytealign (w[22], w[23], offset); w[42] = amd_bytealign (w[21], w[22], offset); w[41] = amd_bytealign (w[20], w[21], offset); w[40] = amd_bytealign (w[19], w[20], offset); w[39] = amd_bytealign (w[18], w[19], offset); w[38] = amd_bytealign (w[17], w[18], offset); w[37] = amd_bytealign (w[16], w[17], offset); w[36] = amd_bytealign (w[15], w[16], offset); w[35] = amd_bytealign (w[14], w[15], offset); w[34] = amd_bytealign (w[13], w[14], offset); w[33] = amd_bytealign (w[12], w[13], offset); w[32] = amd_bytealign (w[11], w[12], offset); w[31] = amd_bytealign (w[10], w[11], offset); w[30] = amd_bytealign (w[ 9], w[10], offset); w[29] = amd_bytealign (w[ 8], w[ 9], offset); w[28] = amd_bytealign (w[ 7], w[ 8], offset); w[27] = amd_bytealign (w[ 6], w[ 7], offset); w[26] = amd_bytealign (w[ 5], w[ 6], offset); w[25] = amd_bytealign (w[ 4], w[ 5], offset); w[24] = amd_bytealign (w[ 3], w[ 4], offset); w[23] = amd_bytealign (w[ 2], w[ 3], offset); w[22] = amd_bytealign (w[ 1], w[ 2], offset); w[21] = amd_bytealign (w[ 0], w[ 1], offset); w[20] = amd_bytealign ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = amd_bytealign (w[41], w[42], offset); w[62] = amd_bytealign (w[40], w[41], offset); w[61] = amd_bytealign (w[39], w[40], offset); w[60] = 
amd_bytealign (w[38], w[39], offset); w[59] = amd_bytealign (w[37], w[38], offset); w[58] = amd_bytealign (w[36], w[37], offset); w[57] = amd_bytealign (w[35], w[36], offset); w[56] = amd_bytealign (w[34], w[35], offset); w[55] = amd_bytealign (w[33], w[34], offset); w[54] = amd_bytealign (w[32], w[33], offset); w[53] = amd_bytealign (w[31], w[32], offset); w[52] = amd_bytealign (w[30], w[31], offset); w[51] = amd_bytealign (w[29], w[30], offset); w[50] = amd_bytealign (w[28], w[29], offset); w[49] = amd_bytealign (w[27], w[28], offset); w[48] = amd_bytealign (w[26], w[27], offset); w[47] = amd_bytealign (w[25], w[26], offset); w[46] = amd_bytealign (w[24], w[25], offset); w[45] = amd_bytealign (w[23], w[24], offset); w[44] = amd_bytealign (w[22], w[23], offset); w[43] = amd_bytealign (w[21], w[22], offset); w[42] = amd_bytealign (w[20], w[21], offset); w[41] = amd_bytealign (w[19], w[20], offset); w[40] = amd_bytealign (w[18], w[19], offset); w[39] = amd_bytealign (w[17], w[18], offset); w[38] = amd_bytealign (w[16], w[17], offset); w[37] = amd_bytealign (w[15], w[16], offset); w[36] = amd_bytealign (w[14], w[15], offset); w[35] = amd_bytealign (w[13], w[14], offset); w[34] = amd_bytealign (w[12], w[13], offset); w[33] = amd_bytealign (w[11], w[12], offset); w[32] = amd_bytealign (w[10], w[11], offset); w[31] = amd_bytealign (w[ 9], w[10], offset); w[30] = amd_bytealign (w[ 8], w[ 9], offset); w[29] = amd_bytealign (w[ 7], w[ 8], offset); w[28] = amd_bytealign (w[ 6], w[ 7], offset); w[27] = amd_bytealign (w[ 5], w[ 6], offset); w[26] = amd_bytealign (w[ 4], w[ 5], offset); w[25] = amd_bytealign (w[ 3], w[ 4], offset); w[24] = amd_bytealign (w[ 2], w[ 3], offset); w[23] = amd_bytealign (w[ 1], w[ 2], offset); w[22] = amd_bytealign (w[ 0], w[ 1], offset); w[21] = amd_bytealign ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] 
= 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = amd_bytealign (w[40], w[41], offset); w[62] = amd_bytealign (w[39], w[40], offset); w[61] = amd_bytealign (w[38], w[39], offset); w[60] = amd_bytealign (w[37], w[38], offset); w[59] = amd_bytealign (w[36], w[37], offset); w[58] = amd_bytealign (w[35], w[36], offset); w[57] = amd_bytealign (w[34], w[35], offset); w[56] = amd_bytealign (w[33], w[34], offset); w[55] = amd_bytealign (w[32], w[33], offset); w[54] = amd_bytealign (w[31], w[32], offset); w[53] = amd_bytealign (w[30], w[31], offset); w[52] = amd_bytealign (w[29], w[30], offset); w[51] = amd_bytealign (w[28], w[29], offset); w[50] = amd_bytealign (w[27], w[28], offset); w[49] = amd_bytealign (w[26], w[27], offset); w[48] = amd_bytealign (w[25], w[26], offset); w[47] = amd_bytealign (w[24], w[25], offset); w[46] = amd_bytealign (w[23], w[24], offset); w[45] = amd_bytealign (w[22], w[23], offset); w[44] = amd_bytealign (w[21], w[22], offset); w[43] = amd_bytealign (w[20], w[21], offset); w[42] = amd_bytealign (w[19], w[20], offset); w[41] = amd_bytealign (w[18], w[19], offset); w[40] = amd_bytealign (w[17], w[18], offset); w[39] = amd_bytealign (w[16], w[17], offset); w[38] = amd_bytealign (w[15], w[16], offset); w[37] = amd_bytealign (w[14], w[15], offset); w[36] = amd_bytealign (w[13], w[14], offset); w[35] = amd_bytealign (w[12], w[13], offset); w[34] = amd_bytealign (w[11], w[12], offset); w[33] = amd_bytealign (w[10], w[11], offset); w[32] = amd_bytealign (w[ 9], w[10], offset); w[31] = amd_bytealign (w[ 8], w[ 9], offset); w[30] = amd_bytealign (w[ 7], w[ 8], offset); w[29] = amd_bytealign (w[ 6], w[ 7], offset); w[28] = amd_bytealign (w[ 5], w[ 6], offset); w[27] = amd_bytealign (w[ 4], w[ 5], offset); w[26] = amd_bytealign (w[ 3], w[ 4], offset); w[25] = amd_bytealign (w[ 2], w[ 3], offset); w[24] = amd_bytealign (w[ 1], w[ 2], offset); w[23] = amd_bytealign (w[ 0], w[ 1], offset); w[22] = amd_bytealign ( 0, w[ 0], 
offset); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = amd_bytealign (w[39], w[40], offset); w[62] = amd_bytealign (w[38], w[39], offset); w[61] = amd_bytealign (w[37], w[38], offset); w[60] = amd_bytealign (w[36], w[37], offset); w[59] = amd_bytealign (w[35], w[36], offset); w[58] = amd_bytealign (w[34], w[35], offset); w[57] = amd_bytealign (w[33], w[34], offset); w[56] = amd_bytealign (w[32], w[33], offset); w[55] = amd_bytealign (w[31], w[32], offset); w[54] = amd_bytealign (w[30], w[31], offset); w[53] = amd_bytealign (w[29], w[30], offset); w[52] = amd_bytealign (w[28], w[29], offset); w[51] = amd_bytealign (w[27], w[28], offset); w[50] = amd_bytealign (w[26], w[27], offset); w[49] = amd_bytealign (w[25], w[26], offset); w[48] = amd_bytealign (w[24], w[25], offset); w[47] = amd_bytealign (w[23], w[24], offset); w[46] = amd_bytealign (w[22], w[23], offset); w[45] = amd_bytealign (w[21], w[22], offset); w[44] = amd_bytealign (w[20], w[21], offset); w[43] = amd_bytealign (w[19], w[20], offset); w[42] = amd_bytealign (w[18], w[19], offset); w[41] = amd_bytealign (w[17], w[18], offset); w[40] = amd_bytealign (w[16], w[17], offset); w[39] = amd_bytealign (w[15], w[16], offset); w[38] = amd_bytealign (w[14], w[15], offset); w[37] = amd_bytealign (w[13], w[14], offset); w[36] = amd_bytealign (w[12], w[13], offset); w[35] = amd_bytealign (w[11], w[12], offset); w[34] = amd_bytealign (w[10], w[11], offset); w[33] = amd_bytealign (w[ 9], w[10], offset); w[32] = amd_bytealign (w[ 8], w[ 9], offset); w[31] = amd_bytealign (w[ 7], w[ 8], offset); w[30] = amd_bytealign (w[ 6], w[ 7], offset); w[29] = amd_bytealign (w[ 5], w[ 6], offset); w[28] = amd_bytealign (w[ 4], w[ 5], offset); w[27] = amd_bytealign (w[ 3], w[ 4], offset); w[26] = amd_bytealign (w[ 2], 
w[ 3], offset); w[25] = amd_bytealign (w[ 1], w[ 2], offset); w[24] = amd_bytealign (w[ 0], w[ 1], offset); w[23] = amd_bytealign ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = amd_bytealign (w[38], w[39], offset); w[62] = amd_bytealign (w[37], w[38], offset); w[61] = amd_bytealign (w[36], w[37], offset); w[60] = amd_bytealign (w[35], w[36], offset); w[59] = amd_bytealign (w[34], w[35], offset); w[58] = amd_bytealign (w[33], w[34], offset); w[57] = amd_bytealign (w[32], w[33], offset); w[56] = amd_bytealign (w[31], w[32], offset); w[55] = amd_bytealign (w[30], w[31], offset); w[54] = amd_bytealign (w[29], w[30], offset); w[53] = amd_bytealign (w[28], w[29], offset); w[52] = amd_bytealign (w[27], w[28], offset); w[51] = amd_bytealign (w[26], w[27], offset); w[50] = amd_bytealign (w[25], w[26], offset); w[49] = amd_bytealign (w[24], w[25], offset); w[48] = amd_bytealign (w[23], w[24], offset); w[47] = amd_bytealign (w[22], w[23], offset); w[46] = amd_bytealign (w[21], w[22], offset); w[45] = amd_bytealign (w[20], w[21], offset); w[44] = amd_bytealign (w[19], w[20], offset); w[43] = amd_bytealign (w[18], w[19], offset); w[42] = amd_bytealign (w[17], w[18], offset); w[41] = amd_bytealign (w[16], w[17], offset); w[40] = amd_bytealign (w[15], w[16], offset); w[39] = amd_bytealign (w[14], w[15], offset); w[38] = amd_bytealign (w[13], w[14], offset); w[37] = amd_bytealign (w[12], w[13], offset); w[36] = amd_bytealign (w[11], w[12], offset); w[35] = amd_bytealign (w[10], w[11], offset); w[34] = amd_bytealign (w[ 9], w[10], offset); w[33] = amd_bytealign (w[ 8], w[ 9], offset); w[32] = amd_bytealign (w[ 7], w[ 8], offset); w[31] = amd_bytealign (w[ 6], w[ 7], offset); w[30] = amd_bytealign (w[ 5], w[ 6], offset); w[29] = 
amd_bytealign (w[ 4], w[ 5], offset); w[28] = amd_bytealign (w[ 3], w[ 4], offset); w[27] = amd_bytealign (w[ 2], w[ 3], offset); w[26] = amd_bytealign (w[ 1], w[ 2], offset); w[25] = amd_bytealign (w[ 0], w[ 1], offset); w[24] = amd_bytealign ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = amd_bytealign (w[37], w[38], offset); w[62] = amd_bytealign (w[36], w[37], offset); w[61] = amd_bytealign (w[35], w[36], offset); w[60] = amd_bytealign (w[34], w[35], offset); w[59] = amd_bytealign (w[33], w[34], offset); w[58] = amd_bytealign (w[32], w[33], offset); w[57] = amd_bytealign (w[31], w[32], offset); w[56] = amd_bytealign (w[30], w[31], offset); w[55] = amd_bytealign (w[29], w[30], offset); w[54] = amd_bytealign (w[28], w[29], offset); w[53] = amd_bytealign (w[27], w[28], offset); w[52] = amd_bytealign (w[26], w[27], offset); w[51] = amd_bytealign (w[25], w[26], offset); w[50] = amd_bytealign (w[24], w[25], offset); w[49] = amd_bytealign (w[23], w[24], offset); w[48] = amd_bytealign (w[22], w[23], offset); w[47] = amd_bytealign (w[21], w[22], offset); w[46] = amd_bytealign (w[20], w[21], offset); w[45] = amd_bytealign (w[19], w[20], offset); w[44] = amd_bytealign (w[18], w[19], offset); w[43] = amd_bytealign (w[17], w[18], offset); w[42] = amd_bytealign (w[16], w[17], offset); w[41] = amd_bytealign (w[15], w[16], offset); w[40] = amd_bytealign (w[14], w[15], offset); w[39] = amd_bytealign (w[13], w[14], offset); w[38] = amd_bytealign (w[12], w[13], offset); w[37] = amd_bytealign (w[11], w[12], offset); w[36] = amd_bytealign (w[10], w[11], offset); w[35] = amd_bytealign (w[ 9], w[10], offset); w[34] = amd_bytealign (w[ 8], w[ 9], offset); w[33] = amd_bytealign (w[ 7], w[ 8], offset); w[32] = amd_bytealign (w[ 
6], w[ 7], offset); w[31] = amd_bytealign (w[ 5], w[ 6], offset); w[30] = amd_bytealign (w[ 4], w[ 5], offset); w[29] = amd_bytealign (w[ 3], w[ 4], offset); w[28] = amd_bytealign (w[ 2], w[ 3], offset); w[27] = amd_bytealign (w[ 1], w[ 2], offset); w[26] = amd_bytealign (w[ 0], w[ 1], offset); w[25] = amd_bytealign ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = amd_bytealign (w[36], w[37], offset); w[62] = amd_bytealign (w[35], w[36], offset); w[61] = amd_bytealign (w[34], w[35], offset); w[60] = amd_bytealign (w[33], w[34], offset); w[59] = amd_bytealign (w[32], w[33], offset); w[58] = amd_bytealign (w[31], w[32], offset); w[57] = amd_bytealign (w[30], w[31], offset); w[56] = amd_bytealign (w[29], w[30], offset); w[55] = amd_bytealign (w[28], w[29], offset); w[54] = amd_bytealign (w[27], w[28], offset); w[53] = amd_bytealign (w[26], w[27], offset); w[52] = amd_bytealign (w[25], w[26], offset); w[51] = amd_bytealign (w[24], w[25], offset); w[50] = amd_bytealign (w[23], w[24], offset); w[49] = amd_bytealign (w[22], w[23], offset); w[48] = amd_bytealign (w[21], w[22], offset); w[47] = amd_bytealign (w[20], w[21], offset); w[46] = amd_bytealign (w[19], w[20], offset); w[45] = amd_bytealign (w[18], w[19], offset); w[44] = amd_bytealign (w[17], w[18], offset); w[43] = amd_bytealign (w[16], w[17], offset); w[42] = amd_bytealign (w[15], w[16], offset); w[41] = amd_bytealign (w[14], w[15], offset); w[40] = amd_bytealign (w[13], w[14], offset); w[39] = amd_bytealign (w[12], w[13], offset); w[38] = amd_bytealign (w[11], w[12], offset); w[37] = amd_bytealign (w[10], w[11], offset); w[36] = amd_bytealign (w[ 9], w[10], offset); w[35] = amd_bytealign (w[ 8], w[ 9], offset); w[34] = amd_bytealign (w[ 7], w[ 
8], offset); w[33] = amd_bytealign (w[ 6], w[ 7], offset); w[32] = amd_bytealign (w[ 5], w[ 6], offset); w[31] = amd_bytealign (w[ 4], w[ 5], offset); w[30] = amd_bytealign (w[ 3], w[ 4], offset); w[29] = amd_bytealign (w[ 2], w[ 3], offset); w[28] = amd_bytealign (w[ 1], w[ 2], offset); w[27] = amd_bytealign (w[ 0], w[ 1], offset); w[26] = amd_bytealign ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = amd_bytealign (w[35], w[36], offset); w[62] = amd_bytealign (w[34], w[35], offset); w[61] = amd_bytealign (w[33], w[34], offset); w[60] = amd_bytealign (w[32], w[33], offset); w[59] = amd_bytealign (w[31], w[32], offset); w[58] = amd_bytealign (w[30], w[31], offset); w[57] = amd_bytealign (w[29], w[30], offset); w[56] = amd_bytealign (w[28], w[29], offset); w[55] = amd_bytealign (w[27], w[28], offset); w[54] = amd_bytealign (w[26], w[27], offset); w[53] = amd_bytealign (w[25], w[26], offset); w[52] = amd_bytealign (w[24], w[25], offset); w[51] = amd_bytealign (w[23], w[24], offset); w[50] = amd_bytealign (w[22], w[23], offset); w[49] = amd_bytealign (w[21], w[22], offset); w[48] = amd_bytealign (w[20], w[21], offset); w[47] = amd_bytealign (w[19], w[20], offset); w[46] = amd_bytealign (w[18], w[19], offset); w[45] = amd_bytealign (w[17], w[18], offset); w[44] = amd_bytealign (w[16], w[17], offset); w[43] = amd_bytealign (w[15], w[16], offset); w[42] = amd_bytealign (w[14], w[15], offset); w[41] = amd_bytealign (w[13], w[14], offset); w[40] = amd_bytealign (w[12], w[13], offset); w[39] = amd_bytealign (w[11], w[12], offset); w[38] = amd_bytealign (w[10], w[11], offset); w[37] = amd_bytealign (w[ 9], w[10], offset); w[36] = amd_bytealign (w[ 8], w[ 9], offset); w[35] = amd_bytealign (w[ 7], 
w[ 8], offset); w[34] = amd_bytealign (w[ 6], w[ 7], offset); w[33] = amd_bytealign (w[ 5], w[ 6], offset); w[32] = amd_bytealign (w[ 4], w[ 5], offset); w[31] = amd_bytealign (w[ 3], w[ 4], offset); w[30] = amd_bytealign (w[ 2], w[ 3], offset); w[29] = amd_bytealign (w[ 1], w[ 2], offset); w[28] = amd_bytealign (w[ 0], w[ 1], offset); w[27] = amd_bytealign ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = amd_bytealign (w[34], w[35], offset); w[62] = amd_bytealign (w[33], w[34], offset); w[61] = amd_bytealign (w[32], w[33], offset); w[60] = amd_bytealign (w[31], w[32], offset); w[59] = amd_bytealign (w[30], w[31], offset); w[58] = amd_bytealign (w[29], w[30], offset); w[57] = amd_bytealign (w[28], w[29], offset); w[56] = amd_bytealign (w[27], w[28], offset); w[55] = amd_bytealign (w[26], w[27], offset); w[54] = amd_bytealign (w[25], w[26], offset); w[53] = amd_bytealign (w[24], w[25], offset); w[52] = amd_bytealign (w[23], w[24], offset); w[51] = amd_bytealign (w[22], w[23], offset); w[50] = amd_bytealign (w[21], w[22], offset); w[49] = amd_bytealign (w[20], w[21], offset); w[48] = amd_bytealign (w[19], w[20], offset); w[47] = amd_bytealign (w[18], w[19], offset); w[46] = amd_bytealign (w[17], w[18], offset); w[45] = amd_bytealign (w[16], w[17], offset); w[44] = amd_bytealign (w[15], w[16], offset); w[43] = amd_bytealign (w[14], w[15], offset); w[42] = amd_bytealign (w[13], w[14], offset); w[41] = amd_bytealign (w[12], w[13], offset); w[40] = amd_bytealign (w[11], w[12], offset); w[39] = amd_bytealign (w[10], w[11], offset); w[38] = amd_bytealign (w[ 9], w[10], offset); w[37] = amd_bytealign (w[ 8], w[ 9], offset); w[36] = amd_bytealign (w[ 7], w[ 8], offset); w[35] = 
amd_bytealign (w[ 6], w[ 7], offset); w[34] = amd_bytealign (w[ 5], w[ 6], offset); w[33] = amd_bytealign (w[ 4], w[ 5], offset); w[32] = amd_bytealign (w[ 3], w[ 4], offset); w[31] = amd_bytealign (w[ 2], w[ 3], offset); w[30] = amd_bytealign (w[ 1], w[ 2], offset); w[29] = amd_bytealign (w[ 0], w[ 1], offset); w[28] = amd_bytealign ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 29: w[63] = amd_bytealign (w[33], w[34], offset); w[62] = amd_bytealign (w[32], w[33], offset); w[61] = amd_bytealign (w[31], w[32], offset); w[60] = amd_bytealign (w[30], w[31], offset); w[59] = amd_bytealign (w[29], w[30], offset); w[58] = amd_bytealign (w[28], w[29], offset); w[57] = amd_bytealign (w[27], w[28], offset); w[56] = amd_bytealign (w[26], w[27], offset); w[55] = amd_bytealign (w[25], w[26], offset); w[54] = amd_bytealign (w[24], w[25], offset); w[53] = amd_bytealign (w[23], w[24], offset); w[52] = amd_bytealign (w[22], w[23], offset); w[51] = amd_bytealign (w[21], w[22], offset); w[50] = amd_bytealign (w[20], w[21], offset); w[49] = amd_bytealign (w[19], w[20], offset); w[48] = amd_bytealign (w[18], w[19], offset); w[47] = amd_bytealign (w[17], w[18], offset); w[46] = amd_bytealign (w[16], w[17], offset); w[45] = amd_bytealign (w[15], w[16], offset); w[44] = amd_bytealign (w[14], w[15], offset); w[43] = amd_bytealign (w[13], w[14], offset); w[42] = amd_bytealign (w[12], w[13], offset); w[41] = amd_bytealign (w[11], w[12], offset); w[40] = amd_bytealign (w[10], w[11], offset); w[39] = amd_bytealign (w[ 9], w[10], offset); w[38] = amd_bytealign (w[ 8], w[ 9], offset); w[37] = amd_bytealign (w[ 7], w[ 8], offset); w[36] = amd_bytealign (w[ 6], w[ 7], offset); w[35] = amd_bytealign (w[ 5], 
w[ 6], offset); w[34] = amd_bytealign (w[ 4], w[ 5], offset); w[33] = amd_bytealign (w[ 3], w[ 4], offset); w[32] = amd_bytealign (w[ 2], w[ 3], offset); w[31] = amd_bytealign (w[ 1], w[ 2], offset); w[30] = amd_bytealign (w[ 0], w[ 1], offset); w[29] = amd_bytealign ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 30: w[63] = amd_bytealign (w[32], w[33], offset); w[62] = amd_bytealign (w[31], w[32], offset); w[61] = amd_bytealign (w[30], w[31], offset); w[60] = amd_bytealign (w[29], w[30], offset); w[59] = amd_bytealign (w[28], w[29], offset); w[58] = amd_bytealign (w[27], w[28], offset); w[57] = amd_bytealign (w[26], w[27], offset); w[56] = amd_bytealign (w[25], w[26], offset); w[55] = amd_bytealign (w[24], w[25], offset); w[54] = amd_bytealign (w[23], w[24], offset); w[53] = amd_bytealign (w[22], w[23], offset); w[52] = amd_bytealign (w[21], w[22], offset); w[51] = amd_bytealign (w[20], w[21], offset); w[50] = amd_bytealign (w[19], w[20], offset); w[49] = amd_bytealign (w[18], w[19], offset); w[48] = amd_bytealign (w[17], w[18], offset); w[47] = amd_bytealign (w[16], w[17], offset); w[46] = amd_bytealign (w[15], w[16], offset); w[45] = amd_bytealign (w[14], w[15], offset); w[44] = amd_bytealign (w[13], w[14], offset); w[43] = amd_bytealign (w[12], w[13], offset); w[42] = amd_bytealign (w[11], w[12], offset); w[41] = amd_bytealign (w[10], w[11], offset); w[40] = amd_bytealign (w[ 9], w[10], offset); w[39] = amd_bytealign (w[ 8], w[ 9], offset); w[38] = amd_bytealign (w[ 7], w[ 8], offset); w[37] = amd_bytealign (w[ 6], w[ 7], offset); w[36] = amd_bytealign (w[ 5], w[ 6], offset); w[35] = amd_bytealign (w[ 4], w[ 5], offset); w[34] = amd_bytealign (w[ 3], w[ 4], 
offset); w[33] = amd_bytealign (w[ 2], w[ 3], offset); w[32] = amd_bytealign (w[ 1], w[ 2], offset); w[31] = amd_bytealign (w[ 0], w[ 1], offset); w[30] = amd_bytealign ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 31: w[63] = amd_bytealign (w[31], w[32], offset); w[62] = amd_bytealign (w[30], w[31], offset); w[61] = amd_bytealign (w[29], w[30], offset); w[60] = amd_bytealign (w[28], w[29], offset); w[59] = amd_bytealign (w[27], w[28], offset); w[58] = amd_bytealign (w[26], w[27], offset); w[57] = amd_bytealign (w[25], w[26], offset); w[56] = amd_bytealign (w[24], w[25], offset); w[55] = amd_bytealign (w[23], w[24], offset); w[54] = amd_bytealign (w[22], w[23], offset); w[53] = amd_bytealign (w[21], w[22], offset); w[52] = amd_bytealign (w[20], w[21], offset); w[51] = amd_bytealign (w[19], w[20], offset); w[50] = amd_bytealign (w[18], w[19], offset); w[49] = amd_bytealign (w[17], w[18], offset); w[48] = amd_bytealign (w[16], w[17], offset); w[47] = amd_bytealign (w[15], w[16], offset); w[46] = amd_bytealign (w[14], w[15], offset); w[45] = amd_bytealign (w[13], w[14], offset); w[44] = amd_bytealign (w[12], w[13], offset); w[43] = amd_bytealign (w[11], w[12], offset); w[42] = amd_bytealign (w[10], w[11], offset); w[41] = amd_bytealign (w[ 9], w[10], offset); w[40] = amd_bytealign (w[ 8], w[ 9], offset); w[39] = amd_bytealign (w[ 7], w[ 8], offset); w[38] = amd_bytealign (w[ 6], w[ 7], offset); w[37] = amd_bytealign (w[ 5], w[ 6], offset); w[36] = amd_bytealign (w[ 4], w[ 5], offset); w[35] = amd_bytealign (w[ 3], w[ 4], offset); w[34] = amd_bytealign (w[ 2], w[ 3], offset); w[33] = amd_bytealign (w[ 1], w[ 2], offset); w[32] = amd_bytealign (w[ 0], w[ 1], 
offset); w[31] = amd_bytealign ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 32: w[63] = amd_bytealign (w[30], w[31], offset); w[62] = amd_bytealign (w[29], w[30], offset); w[61] = amd_bytealign (w[28], w[29], offset); w[60] = amd_bytealign (w[27], w[28], offset); w[59] = amd_bytealign (w[26], w[27], offset); w[58] = amd_bytealign (w[25], w[26], offset); w[57] = amd_bytealign (w[24], w[25], offset); w[56] = amd_bytealign (w[23], w[24], offset); w[55] = amd_bytealign (w[22], w[23], offset); w[54] = amd_bytealign (w[21], w[22], offset); w[53] = amd_bytealign (w[20], w[21], offset); w[52] = amd_bytealign (w[19], w[20], offset); w[51] = amd_bytealign (w[18], w[19], offset); w[50] = amd_bytealign (w[17], w[18], offset); w[49] = amd_bytealign (w[16], w[17], offset); w[48] = amd_bytealign (w[15], w[16], offset); w[47] = amd_bytealign (w[14], w[15], offset); w[46] = amd_bytealign (w[13], w[14], offset); w[45] = amd_bytealign (w[12], w[13], offset); w[44] = amd_bytealign (w[11], w[12], offset); w[43] = amd_bytealign (w[10], w[11], offset); w[42] = amd_bytealign (w[ 9], w[10], offset); w[41] = amd_bytealign (w[ 8], w[ 9], offset); w[40] = amd_bytealign (w[ 7], w[ 8], offset); w[39] = amd_bytealign (w[ 6], w[ 7], offset); w[38] = amd_bytealign (w[ 5], w[ 6], offset); w[37] = amd_bytealign (w[ 4], w[ 5], offset); w[36] = amd_bytealign (w[ 3], w[ 4], offset); w[35] = amd_bytealign (w[ 2], w[ 3], offset); w[34] = amd_bytealign (w[ 1], w[ 2], offset); w[33] = amd_bytealign (w[ 0], w[ 1], offset); w[32] = amd_bytealign ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; 
w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = amd_bytealign (w[29], w[30], offset); w[62] = amd_bytealign (w[28], w[29], offset); w[61] = amd_bytealign (w[27], w[28], offset); w[60] = amd_bytealign (w[26], w[27], offset); w[59] = amd_bytealign (w[25], w[26], offset); w[58] = amd_bytealign (w[24], w[25], offset); w[57] = amd_bytealign (w[23], w[24], offset); w[56] = amd_bytealign (w[22], w[23], offset); w[55] = amd_bytealign (w[21], w[22], offset); w[54] = amd_bytealign (w[20], w[21], offset); w[53] = amd_bytealign (w[19], w[20], offset); w[52] = amd_bytealign (w[18], w[19], offset); w[51] = amd_bytealign (w[17], w[18], offset); w[50] = amd_bytealign (w[16], w[17], offset); w[49] = amd_bytealign (w[15], w[16], offset); w[48] = amd_bytealign (w[14], w[15], offset); w[47] = amd_bytealign (w[13], w[14], offset); w[46] = amd_bytealign (w[12], w[13], offset); w[45] = amd_bytealign (w[11], w[12], offset); w[44] = amd_bytealign (w[10], w[11], offset); w[43] = amd_bytealign (w[ 9], w[10], offset); w[42] = amd_bytealign (w[ 8], w[ 9], offset); w[41] = amd_bytealign (w[ 7], w[ 8], offset); w[40] = amd_bytealign (w[ 6], w[ 7], offset); w[39] = amd_bytealign (w[ 5], w[ 6], offset); w[38] = amd_bytealign (w[ 4], w[ 5], offset); w[37] = amd_bytealign (w[ 3], w[ 4], offset); w[36] = amd_bytealign (w[ 2], w[ 3], offset); w[35] = amd_bytealign (w[ 1], w[ 2], offset); w[34] = amd_bytealign (w[ 0], w[ 1], offset); w[33] = amd_bytealign ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; 
w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = amd_bytealign (w[28], w[29], offset); w[62] = amd_bytealign (w[27], w[28], offset); w[61] = amd_bytealign (w[26], w[27], offset); w[60] = amd_bytealign (w[25], w[26], offset); w[59] = amd_bytealign (w[24], w[25], offset); w[58] = amd_bytealign (w[23], w[24], offset); w[57] = amd_bytealign (w[22], w[23], offset); w[56] = amd_bytealign (w[21], w[22], offset); w[55] = amd_bytealign (w[20], w[21], offset); w[54] = amd_bytealign (w[19], w[20], offset); w[53] = amd_bytealign (w[18], w[19], offset); w[52] = amd_bytealign (w[17], w[18], offset); w[51] = amd_bytealign (w[16], w[17], offset); w[50] = amd_bytealign (w[15], w[16], offset); w[49] = amd_bytealign (w[14], w[15], offset); w[48] = amd_bytealign (w[13], w[14], offset); w[47] = amd_bytealign (w[12], w[13], offset); w[46] = amd_bytealign (w[11], w[12], offset); w[45] = amd_bytealign (w[10], w[11], offset); w[44] = amd_bytealign (w[ 9], w[10], offset); w[43] = amd_bytealign (w[ 8], w[ 9], offset); w[42] = amd_bytealign (w[ 7], w[ 8], offset); w[41] = amd_bytealign (w[ 6], w[ 7], offset); w[40] = amd_bytealign (w[ 5], w[ 6], offset); w[39] = amd_bytealign (w[ 4], w[ 5], offset); w[38] = amd_bytealign (w[ 3], w[ 4], offset); w[37] = amd_bytealign (w[ 2], w[ 3], offset); w[36] = amd_bytealign (w[ 1], w[ 2], offset); w[35] = amd_bytealign (w[ 0], w[ 1], offset); w[34] = amd_bytealign ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = amd_bytealign (w[27], w[28], offset); w[62] = amd_bytealign (w[26], w[27], offset); w[61] = amd_bytealign (w[25], w[26], offset); w[60] = amd_bytealign (w[24], w[25], offset); 
w[59] = amd_bytealign (w[23], w[24], offset); w[58] = amd_bytealign (w[22], w[23], offset); w[57] = amd_bytealign (w[21], w[22], offset); w[56] = amd_bytealign (w[20], w[21], offset); w[55] = amd_bytealign (w[19], w[20], offset); w[54] = amd_bytealign (w[18], w[19], offset); w[53] = amd_bytealign (w[17], w[18], offset); w[52] = amd_bytealign (w[16], w[17], offset); w[51] = amd_bytealign (w[15], w[16], offset); w[50] = amd_bytealign (w[14], w[15], offset); w[49] = amd_bytealign (w[13], w[14], offset); w[48] = amd_bytealign (w[12], w[13], offset); w[47] = amd_bytealign (w[11], w[12], offset); w[46] = amd_bytealign (w[10], w[11], offset); w[45] = amd_bytealign (w[ 9], w[10], offset); w[44] = amd_bytealign (w[ 8], w[ 9], offset); w[43] = amd_bytealign (w[ 7], w[ 8], offset); w[42] = amd_bytealign (w[ 6], w[ 7], offset); w[41] = amd_bytealign (w[ 5], w[ 6], offset); w[40] = amd_bytealign (w[ 4], w[ 5], offset); w[39] = amd_bytealign (w[ 3], w[ 4], offset); w[38] = amd_bytealign (w[ 2], w[ 3], offset); w[37] = amd_bytealign (w[ 1], w[ 2], offset); w[36] = amd_bytealign (w[ 0], w[ 1], offset); w[35] = amd_bytealign ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = amd_bytealign (w[26], w[27], offset); w[62] = amd_bytealign (w[25], w[26], offset); w[61] = amd_bytealign (w[24], w[25], offset); w[60] = amd_bytealign (w[23], w[24], offset); w[59] = amd_bytealign (w[22], w[23], offset); w[58] = amd_bytealign (w[21], w[22], offset); w[57] = amd_bytealign (w[20], w[21], offset); w[56] = amd_bytealign (w[19], w[20], offset); w[55] = amd_bytealign (w[18], w[19], offset); w[54] = amd_bytealign (w[17], w[18], 
offset); w[53] = amd_bytealign (w[16], w[17], offset); w[52] = amd_bytealign (w[15], w[16], offset); w[51] = amd_bytealign (w[14], w[15], offset); w[50] = amd_bytealign (w[13], w[14], offset); w[49] = amd_bytealign (w[12], w[13], offset); w[48] = amd_bytealign (w[11], w[12], offset); w[47] = amd_bytealign (w[10], w[11], offset); w[46] = amd_bytealign (w[ 9], w[10], offset); w[45] = amd_bytealign (w[ 8], w[ 9], offset); w[44] = amd_bytealign (w[ 7], w[ 8], offset); w[43] = amd_bytealign (w[ 6], w[ 7], offset); w[42] = amd_bytealign (w[ 5], w[ 6], offset); w[41] = amd_bytealign (w[ 4], w[ 5], offset); w[40] = amd_bytealign (w[ 3], w[ 4], offset); w[39] = amd_bytealign (w[ 2], w[ 3], offset); w[38] = amd_bytealign (w[ 1], w[ 2], offset); w[37] = amd_bytealign (w[ 0], w[ 1], offset); w[36] = amd_bytealign ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = amd_bytealign (w[25], w[26], offset); w[62] = amd_bytealign (w[24], w[25], offset); w[61] = amd_bytealign (w[23], w[24], offset); w[60] = amd_bytealign (w[22], w[23], offset); w[59] = amd_bytealign (w[21], w[22], offset); w[58] = amd_bytealign (w[20], w[21], offset); w[57] = amd_bytealign (w[19], w[20], offset); w[56] = amd_bytealign (w[18], w[19], offset); w[55] = amd_bytealign (w[17], w[18], offset); w[54] = amd_bytealign (w[16], w[17], offset); w[53] = amd_bytealign (w[15], w[16], offset); w[52] = amd_bytealign (w[14], w[15], offset); w[51] = amd_bytealign (w[13], w[14], offset); w[50] = amd_bytealign (w[12], w[13], offset); w[49] = amd_bytealign (w[11], w[12], offset); w[48] = amd_bytealign (w[10], w[11], offset); w[47] = 
amd_bytealign (w[ 9], w[10], offset); w[46] = amd_bytealign (w[ 8], w[ 9], offset); w[45] = amd_bytealign (w[ 7], w[ 8], offset); w[44] = amd_bytealign (w[ 6], w[ 7], offset); w[43] = amd_bytealign (w[ 5], w[ 6], offset); w[42] = amd_bytealign (w[ 4], w[ 5], offset); w[41] = amd_bytealign (w[ 3], w[ 4], offset); w[40] = amd_bytealign (w[ 2], w[ 3], offset); w[39] = amd_bytealign (w[ 1], w[ 2], offset); w[38] = amd_bytealign (w[ 0], w[ 1], offset); w[37] = amd_bytealign ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = amd_bytealign (w[24], w[25], offset); w[62] = amd_bytealign (w[23], w[24], offset); w[61] = amd_bytealign (w[22], w[23], offset); w[60] = amd_bytealign (w[21], w[22], offset); w[59] = amd_bytealign (w[20], w[21], offset); w[58] = amd_bytealign (w[19], w[20], offset); w[57] = amd_bytealign (w[18], w[19], offset); w[56] = amd_bytealign (w[17], w[18], offset); w[55] = amd_bytealign (w[16], w[17], offset); w[54] = amd_bytealign (w[15], w[16], offset); w[53] = amd_bytealign (w[14], w[15], offset); w[52] = amd_bytealign (w[13], w[14], offset); w[51] = amd_bytealign (w[12], w[13], offset); w[50] = amd_bytealign (w[11], w[12], offset); w[49] = amd_bytealign (w[10], w[11], offset); w[48] = amd_bytealign (w[ 9], w[10], offset); w[47] = amd_bytealign (w[ 8], w[ 9], offset); w[46] = amd_bytealign (w[ 7], w[ 8], offset); w[45] = amd_bytealign (w[ 6], w[ 7], offset); w[44] = amd_bytealign (w[ 5], w[ 6], offset); w[43] = amd_bytealign (w[ 4], w[ 5], offset); w[42] = amd_bytealign (w[ 3], w[ 4], offset); w[41] = amd_bytealign (w[ 2], w[ 3], offset); w[40] = amd_bytealign 
(w[ 1], w[ 2], offset); w[39] = amd_bytealign (w[ 0], w[ 1], offset); w[38] = amd_bytealign ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = amd_bytealign (w[23], w[24], offset); w[62] = amd_bytealign (w[22], w[23], offset); w[61] = amd_bytealign (w[21], w[22], offset); w[60] = amd_bytealign (w[20], w[21], offset); w[59] = amd_bytealign (w[19], w[20], offset); w[58] = amd_bytealign (w[18], w[19], offset); w[57] = amd_bytealign (w[17], w[18], offset); w[56] = amd_bytealign (w[16], w[17], offset); w[55] = amd_bytealign (w[15], w[16], offset); w[54] = amd_bytealign (w[14], w[15], offset); w[53] = amd_bytealign (w[13], w[14], offset); w[52] = amd_bytealign (w[12], w[13], offset); w[51] = amd_bytealign (w[11], w[12], offset); w[50] = amd_bytealign (w[10], w[11], offset); w[49] = amd_bytealign (w[ 9], w[10], offset); w[48] = amd_bytealign (w[ 8], w[ 9], offset); w[47] = amd_bytealign (w[ 7], w[ 8], offset); w[46] = amd_bytealign (w[ 6], w[ 7], offset); w[45] = amd_bytealign (w[ 5], w[ 6], offset); w[44] = amd_bytealign (w[ 4], w[ 5], offset); w[43] = amd_bytealign (w[ 3], w[ 4], offset); w[42] = amd_bytealign (w[ 2], w[ 3], offset); w[41] = amd_bytealign (w[ 1], w[ 2], offset); w[40] = amd_bytealign (w[ 0], w[ 1], offset); w[39] = amd_bytealign ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 
0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = amd_bytealign (w[22], w[23], offset); w[62] = amd_bytealign (w[21], w[22], offset); w[61] = amd_bytealign (w[20], w[21], offset); w[60] = amd_bytealign (w[19], w[20], offset); w[59] = amd_bytealign (w[18], w[19], offset); w[58] = amd_bytealign (w[17], w[18], offset); w[57] = amd_bytealign (w[16], w[17], offset); w[56] = amd_bytealign (w[15], w[16], offset); w[55] = amd_bytealign (w[14], w[15], offset); w[54] = amd_bytealign (w[13], w[14], offset); w[53] = amd_bytealign (w[12], w[13], offset); w[52] = amd_bytealign (w[11], w[12], offset); w[51] = amd_bytealign (w[10], w[11], offset); w[50] = amd_bytealign (w[ 9], w[10], offset); w[49] = amd_bytealign (w[ 8], w[ 9], offset); w[48] = amd_bytealign (w[ 7], w[ 8], offset); w[47] = amd_bytealign (w[ 6], w[ 7], offset); w[46] = amd_bytealign (w[ 5], w[ 6], offset); w[45] = amd_bytealign (w[ 4], w[ 5], offset); w[44] = amd_bytealign (w[ 3], w[ 4], offset); w[43] = amd_bytealign (w[ 2], w[ 3], offset); w[42] = amd_bytealign (w[ 1], w[ 2], offset); w[41] = amd_bytealign (w[ 0], w[ 1], offset); w[40] = amd_bytealign ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = amd_bytealign (w[21], w[22], offset); w[62] = amd_bytealign (w[20], w[21], offset); w[61] = amd_bytealign (w[19], w[20], offset); w[60] = amd_bytealign (w[18], w[19], offset); w[59] = amd_bytealign (w[17], w[18], offset); w[58] = amd_bytealign (w[16], w[17], offset); w[57] = amd_bytealign 
(w[15], w[16], offset); w[56] = amd_bytealign (w[14], w[15], offset); w[55] = amd_bytealign (w[13], w[14], offset); w[54] = amd_bytealign (w[12], w[13], offset); w[53] = amd_bytealign (w[11], w[12], offset); w[52] = amd_bytealign (w[10], w[11], offset); w[51] = amd_bytealign (w[ 9], w[10], offset); w[50] = amd_bytealign (w[ 8], w[ 9], offset); w[49] = amd_bytealign (w[ 7], w[ 8], offset); w[48] = amd_bytealign (w[ 6], w[ 7], offset); w[47] = amd_bytealign (w[ 5], w[ 6], offset); w[46] = amd_bytealign (w[ 4], w[ 5], offset); w[45] = amd_bytealign (w[ 3], w[ 4], offset); w[44] = amd_bytealign (w[ 2], w[ 3], offset); w[43] = amd_bytealign (w[ 1], w[ 2], offset); w[42] = amd_bytealign (w[ 0], w[ 1], offset); w[41] = amd_bytealign ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 42: w[63] = amd_bytealign (w[20], w[21], offset); w[62] = amd_bytealign (w[19], w[20], offset); w[61] = amd_bytealign (w[18], w[19], offset); w[60] = amd_bytealign (w[17], w[18], offset); w[59] = amd_bytealign (w[16], w[17], offset); w[58] = amd_bytealign (w[15], w[16], offset); w[57] = amd_bytealign (w[14], w[15], offset); w[56] = amd_bytealign (w[13], w[14], offset); w[55] = amd_bytealign (w[12], w[13], offset); w[54] = amd_bytealign (w[11], w[12], offset); w[53] = amd_bytealign (w[10], w[11], offset); w[52] = amd_bytealign (w[ 9], w[10], offset); w[51] = amd_bytealign (w[ 8], w[ 9], offset); w[50] = amd_bytealign (w[ 7], w[ 8], offset); w[49] = amd_bytealign (w[ 6], w[ 7], offset); w[48] = amd_bytealign (w[ 5], w[ 6], offset); w[47] = amd_bytealign (w[ 4], w[ 5], 
offset); w[46] = amd_bytealign (w[ 3], w[ 4], offset); w[45] = amd_bytealign (w[ 2], w[ 3], offset); w[44] = amd_bytealign (w[ 1], w[ 2], offset); w[43] = amd_bytealign (w[ 0], w[ 1], offset); w[42] = amd_bytealign ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = amd_bytealign (w[19], w[20], offset); w[62] = amd_bytealign (w[18], w[19], offset); w[61] = amd_bytealign (w[17], w[18], offset); w[60] = amd_bytealign (w[16], w[17], offset); w[59] = amd_bytealign (w[15], w[16], offset); w[58] = amd_bytealign (w[14], w[15], offset); w[57] = amd_bytealign (w[13], w[14], offset); w[56] = amd_bytealign (w[12], w[13], offset); w[55] = amd_bytealign (w[11], w[12], offset); w[54] = amd_bytealign (w[10], w[11], offset); w[53] = amd_bytealign (w[ 9], w[10], offset); w[52] = amd_bytealign (w[ 8], w[ 9], offset); w[51] = amd_bytealign (w[ 7], w[ 8], offset); w[50] = amd_bytealign (w[ 6], w[ 7], offset); w[49] = amd_bytealign (w[ 5], w[ 6], offset); w[48] = amd_bytealign (w[ 4], w[ 5], offset); w[47] = amd_bytealign (w[ 3], w[ 4], offset); w[46] = amd_bytealign (w[ 2], w[ 3], offset); w[45] = amd_bytealign (w[ 1], w[ 2], offset); w[44] = amd_bytealign (w[ 0], w[ 1], offset); w[43] = amd_bytealign ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; 
w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = amd_bytealign (w[18], w[19], offset); w[62] = amd_bytealign (w[17], w[18], offset); w[61] = amd_bytealign (w[16], w[17], offset); w[60] = amd_bytealign (w[15], w[16], offset); w[59] = amd_bytealign (w[14], w[15], offset); w[58] = amd_bytealign (w[13], w[14], offset); w[57] = amd_bytealign (w[12], w[13], offset); w[56] = amd_bytealign (w[11], w[12], offset); w[55] = amd_bytealign (w[10], w[11], offset); w[54] = amd_bytealign (w[ 9], w[10], offset); w[53] = amd_bytealign (w[ 8], w[ 9], offset); w[52] = amd_bytealign (w[ 7], w[ 8], offset); w[51] = amd_bytealign (w[ 6], w[ 7], offset); w[50] = amd_bytealign (w[ 5], w[ 6], offset); w[49] = amd_bytealign (w[ 4], w[ 5], offset); w[48] = amd_bytealign (w[ 3], w[ 4], offset); w[47] = amd_bytealign (w[ 2], w[ 3], offset); w[46] = amd_bytealign (w[ 1], w[ 2], offset); w[45] = amd_bytealign (w[ 0], w[ 1], offset); w[44] = amd_bytealign ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = amd_bytealign (w[17], w[18], offset); w[62] = amd_bytealign (w[16], w[17], offset); w[61] = amd_bytealign (w[15], w[16], offset); w[60] = amd_bytealign (w[14], w[15], offset); w[59] = amd_bytealign (w[13], w[14], offset); w[58] = amd_bytealign (w[12], w[13], offset); w[57] = amd_bytealign (w[11], w[12], offset); w[56] = amd_bytealign (w[10], w[11], offset); w[55] = amd_bytealign (w[ 9], w[10], 
offset); w[54] = amd_bytealign (w[ 8], w[ 9], offset); w[53] = amd_bytealign (w[ 7], w[ 8], offset); w[52] = amd_bytealign (w[ 6], w[ 7], offset); w[51] = amd_bytealign (w[ 5], w[ 6], offset); w[50] = amd_bytealign (w[ 4], w[ 5], offset); w[49] = amd_bytealign (w[ 3], w[ 4], offset); w[48] = amd_bytealign (w[ 2], w[ 3], offset); w[47] = amd_bytealign (w[ 1], w[ 2], offset); w[46] = amd_bytealign (w[ 0], w[ 1], offset); w[45] = amd_bytealign ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = amd_bytealign (w[16], w[17], offset); w[62] = amd_bytealign (w[15], w[16], offset); w[61] = amd_bytealign (w[14], w[15], offset); w[60] = amd_bytealign (w[13], w[14], offset); w[59] = amd_bytealign (w[12], w[13], offset); w[58] = amd_bytealign (w[11], w[12], offset); w[57] = amd_bytealign (w[10], w[11], offset); w[56] = amd_bytealign (w[ 9], w[10], offset); w[55] = amd_bytealign (w[ 8], w[ 9], offset); w[54] = amd_bytealign (w[ 7], w[ 8], offset); w[53] = amd_bytealign (w[ 6], w[ 7], offset); w[52] = amd_bytealign (w[ 5], w[ 6], offset); w[51] = amd_bytealign (w[ 4], w[ 5], offset); w[50] = amd_bytealign (w[ 3], w[ 4], offset); w[49] = amd_bytealign (w[ 2], w[ 3], offset); w[48] = amd_bytealign (w[ 1], w[ 2], offset); w[47] = amd_bytealign (w[ 0], w[ 1], offset); w[46] = amd_bytealign ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; 
w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 47: w[63] = amd_bytealign (w[15], w[16], offset); w[62] = amd_bytealign (w[14], w[15], offset); w[61] = amd_bytealign (w[13], w[14], offset); w[60] = amd_bytealign (w[12], w[13], offset); w[59] = amd_bytealign (w[11], w[12], offset); w[58] = amd_bytealign (w[10], w[11], offset); w[57] = amd_bytealign (w[ 9], w[10], offset); w[56] = amd_bytealign (w[ 8], w[ 9], offset); w[55] = amd_bytealign (w[ 7], w[ 8], offset); w[54] = amd_bytealign (w[ 6], w[ 7], offset); w[53] = amd_bytealign (w[ 5], w[ 6], offset); w[52] = amd_bytealign (w[ 4], w[ 5], offset); w[51] = amd_bytealign (w[ 3], w[ 4], offset); w[50] = amd_bytealign (w[ 2], w[ 3], offset); w[49] = amd_bytealign (w[ 1], w[ 2], offset); w[48] = amd_bytealign (w[ 0], w[ 1], offset); w[47] = amd_bytealign ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = amd_bytealign (w[14], w[15], offset); w[62] = amd_bytealign (w[13], w[14], offset); w[61] = amd_bytealign (w[12], w[13], offset); w[60] = amd_bytealign (w[11], w[12], offset); w[59] = amd_bytealign (w[10], w[11], offset); w[58] = amd_bytealign (w[ 9], w[10], offset); w[57] = amd_bytealign (w[ 8], w[ 9], offset); w[56] = amd_bytealign (w[ 7], w[ 8], 
offset); w[55] = amd_bytealign (w[ 6], w[ 7], offset); w[54] = amd_bytealign (w[ 5], w[ 6], offset); w[53] = amd_bytealign (w[ 4], w[ 5], offset); w[52] = amd_bytealign (w[ 3], w[ 4], offset); w[51] = amd_bytealign (w[ 2], w[ 3], offset); w[50] = amd_bytealign (w[ 1], w[ 2], offset); w[49] = amd_bytealign (w[ 0], w[ 1], offset); w[48] = amd_bytealign ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = amd_bytealign (w[13], w[14], offset); w[62] = amd_bytealign (w[12], w[13], offset); w[61] = amd_bytealign (w[11], w[12], offset); w[60] = amd_bytealign (w[10], w[11], offset); w[59] = amd_bytealign (w[ 9], w[10], offset); w[58] = amd_bytealign (w[ 8], w[ 9], offset); w[57] = amd_bytealign (w[ 7], w[ 8], offset); w[56] = amd_bytealign (w[ 6], w[ 7], offset); w[55] = amd_bytealign (w[ 5], w[ 6], offset); w[54] = amd_bytealign (w[ 4], w[ 5], offset); w[53] = amd_bytealign (w[ 3], w[ 4], offset); w[52] = amd_bytealign (w[ 2], w[ 3], offset); w[51] = amd_bytealign (w[ 1], w[ 2], offset); w[50] = amd_bytealign (w[ 0], w[ 1], offset); w[49] = amd_bytealign ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 
0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = amd_bytealign (w[12], w[13], offset); w[62] = amd_bytealign (w[11], w[12], offset); w[61] = amd_bytealign (w[10], w[11], offset); w[60] = amd_bytealign (w[ 9], w[10], offset); w[59] = amd_bytealign (w[ 8], w[ 9], offset); w[58] = amd_bytealign (w[ 7], w[ 8], offset); w[57] = amd_bytealign (w[ 6], w[ 7], offset); w[56] = amd_bytealign (w[ 5], w[ 6], offset); w[55] = amd_bytealign (w[ 4], w[ 5], offset); w[54] = amd_bytealign (w[ 3], w[ 4], offset); w[53] = amd_bytealign (w[ 2], w[ 3], offset); w[52] = amd_bytealign (w[ 1], w[ 2], offset); w[51] = amd_bytealign (w[ 0], w[ 1], offset); w[50] = amd_bytealign ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = amd_bytealign (w[11], w[12], offset); w[62] = amd_bytealign (w[10], w[11], offset); w[61] = amd_bytealign (w[ 9], w[10], offset); w[60] = amd_bytealign (w[ 8], w[ 9], offset); w[59] = amd_bytealign (w[ 7], w[ 8], offset); w[58] = amd_bytealign (w[ 6], w[ 7], offset); w[57] = amd_bytealign (w[ 5], w[ 6], offset); w[56] = amd_bytealign (w[ 4], w[ 5], offset); w[55] = amd_bytealign (w[ 3], w[ 4], offset); w[54] = amd_bytealign (w[ 2], w[ 3], offset); w[53] = amd_bytealign (w[ 1], w[ 2], offset); w[52] = amd_bytealign (w[ 0], w[ 1], offset); w[51] = amd_bytealign ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; 
w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = amd_bytealign (w[10], w[11], offset); w[62] = amd_bytealign (w[ 9], w[10], offset); w[61] = amd_bytealign (w[ 8], w[ 9], offset); w[60] = amd_bytealign (w[ 7], w[ 8], offset); w[59] = amd_bytealign (w[ 6], w[ 7], offset); w[58] = amd_bytealign (w[ 5], w[ 6], offset); w[57] = amd_bytealign (w[ 4], w[ 5], offset); w[56] = amd_bytealign (w[ 3], w[ 4], offset); w[55] = amd_bytealign (w[ 2], w[ 3], offset); w[54] = amd_bytealign (w[ 1], w[ 2], offset); w[53] = amd_bytealign (w[ 0], w[ 1], offset); w[52] = amd_bytealign ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = amd_bytealign (w[ 9], w[10], offset); w[62] = amd_bytealign (w[ 8], w[ 9], offset); w[61] = amd_bytealign (w[ 7], w[ 8], offset); w[60] = amd_bytealign (w[ 6], w[ 7], offset); w[59] = amd_bytealign (w[ 5], w[ 6], offset); w[58] = amd_bytealign (w[ 4], w[ 5], offset); w[57] = amd_bytealign (w[ 3], w[ 4], 
offset); w[56] = amd_bytealign (w[ 2], w[ 3], offset); w[55] = amd_bytealign (w[ 1], w[ 2], offset); w[54] = amd_bytealign (w[ 0], w[ 1], offset); w[53] = amd_bytealign ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = amd_bytealign (w[ 8], w[ 9], offset); w[62] = amd_bytealign (w[ 7], w[ 8], offset); w[61] = amd_bytealign (w[ 6], w[ 7], offset); w[60] = amd_bytealign (w[ 5], w[ 6], offset); w[59] = amd_bytealign (w[ 4], w[ 5], offset); w[58] = amd_bytealign (w[ 3], w[ 4], offset); w[57] = amd_bytealign (w[ 2], w[ 3], offset); w[56] = amd_bytealign (w[ 1], w[ 2], offset); w[55] = amd_bytealign (w[ 0], w[ 1], offset); w[54] = amd_bytealign ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = amd_bytealign (w[ 7], w[ 8], offset); w[62] = amd_bytealign (w[ 6], w[ 7], offset); w[61] = amd_bytealign (w[ 5], w[ 6], offset); w[60] 
= amd_bytealign (w[ 4], w[ 5], offset); w[59] = amd_bytealign (w[ 3], w[ 4], offset); w[58] = amd_bytealign (w[ 2], w[ 3], offset); w[57] = amd_bytealign (w[ 1], w[ 2], offset); w[56] = amd_bytealign (w[ 0], w[ 1], offset); w[55] = amd_bytealign ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 56: w[63] = amd_bytealign (w[ 6], w[ 7], offset); w[62] = amd_bytealign (w[ 5], w[ 6], offset); w[61] = amd_bytealign (w[ 4], w[ 5], offset); w[60] = amd_bytealign (w[ 3], w[ 4], offset); w[59] = amd_bytealign (w[ 2], w[ 3], offset); w[58] = amd_bytealign (w[ 1], w[ 2], offset); w[57] = amd_bytealign (w[ 0], w[ 1], offset); w[56] = amd_bytealign ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = amd_bytealign (w[ 5], w[ 6], offset); w[62] = amd_bytealign (w[ 4], w[ 5], offset); w[61] = amd_bytealign 
(w[ 3], w[ 4], offset); w[60] = amd_bytealign (w[ 2], w[ 3], offset); w[59] = amd_bytealign (w[ 1], w[ 2], offset); w[58] = amd_bytealign (w[ 0], w[ 1], offset); w[57] = amd_bytealign ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = amd_bytealign (w[ 4], w[ 5], offset); w[62] = amd_bytealign (w[ 3], w[ 4], offset); w[61] = amd_bytealign (w[ 2], w[ 3], offset); w[60] = amd_bytealign (w[ 1], w[ 2], offset); w[59] = amd_bytealign (w[ 0], w[ 1], offset); w[58] = amd_bytealign ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = amd_bytealign (w[ 3], w[ 4], offset); w[62] = amd_bytealign (w[ 2], w[ 3], offset); w[61] = amd_bytealign (w[ 1], w[ 2], offset); w[60] = amd_bytealign (w[ 0], w[ 1], offset); w[59] = amd_bytealign ( 0, w[ 0], 
offset); w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = amd_bytealign (w[ 2], w[ 3], offset); w[62] = amd_bytealign (w[ 1], w[ 2], offset); w[61] = amd_bytealign (w[ 0], w[ 1], offset); w[60] = amd_bytealign ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = amd_bytealign (w[ 1], w[ 2], offset); w[62] = amd_bytealign (w[ 0], w[ 1], offset); w[61] = amd_bytealign ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; 
w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = amd_bytealign (w[ 0], w[ 1], offset); w[62] = amd_bytealign ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 63: w[63] = amd_bytealign ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = 
(0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: w[63] = __byte_perm (w[63], w[62], selector); w[62] = __byte_perm (w[62], w[61], selector); w[61] = __byte_perm (w[61], w[60], selector); w[60] = __byte_perm (w[60], w[59], selector); w[59] = __byte_perm (w[59], w[58], selector); w[58] = __byte_perm (w[58], w[57], selector); w[57] = __byte_perm (w[57], w[56], selector); w[56] = __byte_perm (w[56], w[55], selector); w[55] = __byte_perm (w[55], w[54], selector); w[54] = __byte_perm (w[54], w[53], selector); w[53] = __byte_perm (w[53], w[52], selector); w[52] = __byte_perm (w[52], w[51], selector); w[51] = __byte_perm (w[51], w[50], selector); w[50] = __byte_perm (w[50], w[49], selector); w[49] = __byte_perm (w[49], w[48], selector); w[48] = __byte_perm (w[48], w[47], selector); w[47] = __byte_perm (w[47], w[46], selector); w[46] = __byte_perm (w[46], w[45], selector); w[45] = __byte_perm (w[45], w[44], selector); w[44] = __byte_perm (w[44], w[43], selector); w[43] = __byte_perm (w[43], w[42], selector); w[42] = __byte_perm (w[42], w[41], selector); w[41] = __byte_perm (w[41], w[40], selector); w[40] = __byte_perm (w[40], w[39], selector); w[39] = __byte_perm (w[39], w[38], selector); w[38] = __byte_perm (w[38], w[37], selector); w[37] = __byte_perm (w[37], w[36], selector); w[36] = __byte_perm (w[36], w[35], selector); w[35] = __byte_perm (w[35], w[34], selector); w[34] = __byte_perm (w[34], w[33], selector); w[33] = __byte_perm (w[33], w[32], selector); w[32] = __byte_perm (w[32], w[31], selector); w[31] = __byte_perm (w[31], w[30], selector); w[30] = __byte_perm (w[30], w[29], selector); w[29] = __byte_perm (w[29], w[28], selector); w[28] = __byte_perm (w[28], w[27], selector); w[27] = __byte_perm (w[27], w[26], selector); w[26] = __byte_perm (w[26], w[25], selector); w[25] = __byte_perm (w[25], w[24], selector); w[24] = __byte_perm (w[24], 
w[23], selector); w[23] = __byte_perm (w[23], w[22], selector); w[22] = __byte_perm (w[22], w[21], selector); w[21] = __byte_perm (w[21], w[20], selector); w[20] = __byte_perm (w[20], w[19], selector); w[19] = __byte_perm (w[19], w[18], selector); w[18] = __byte_perm (w[18], w[17], selector); w[17] = __byte_perm (w[17], w[16], selector); w[16] = __byte_perm (w[16], w[15], selector); w[15] = __byte_perm (w[15], w[14], selector); w[14] = __byte_perm (w[14], w[13], selector); w[13] = __byte_perm (w[13], w[12], selector); w[12] = __byte_perm (w[12], w[11], selector); w[11] = __byte_perm (w[11], w[10], selector); w[10] = __byte_perm (w[10], w[ 9], selector); w[ 9] = __byte_perm (w[ 9], w[ 8], selector); w[ 8] = __byte_perm (w[ 8], w[ 7], selector); w[ 7] = __byte_perm (w[ 7], w[ 6], selector); w[ 6] = __byte_perm (w[ 6], w[ 5], selector); w[ 5] = __byte_perm (w[ 5], w[ 4], selector); w[ 4] = __byte_perm (w[ 4], w[ 3], selector); w[ 3] = __byte_perm (w[ 3], w[ 2], selector); w[ 2] = __byte_perm (w[ 2], w[ 1], selector); w[ 1] = __byte_perm (w[ 1], w[ 0], selector); w[ 0] = __byte_perm (w[ 0], 0, selector); break; case 1: w[63] = __byte_perm (w[62], w[61], selector); w[62] = __byte_perm (w[61], w[60], selector); w[61] = __byte_perm (w[60], w[59], selector); w[60] = __byte_perm (w[59], w[58], selector); w[59] = __byte_perm (w[58], w[57], selector); w[58] = __byte_perm (w[57], w[56], selector); w[57] = __byte_perm (w[56], w[55], selector); w[56] = __byte_perm (w[55], w[54], selector); w[55] = __byte_perm (w[54], w[53], selector); w[54] = __byte_perm (w[53], w[52], selector); w[53] = __byte_perm (w[52], w[51], selector); w[52] = __byte_perm (w[51], w[50], selector); w[51] = __byte_perm (w[50], w[49], selector); w[50] = __byte_perm (w[49], w[48], selector); w[49] = __byte_perm (w[48], w[47], selector); w[48] = __byte_perm (w[47], w[46], selector); w[47] = __byte_perm (w[46], w[45], selector); w[46] = __byte_perm (w[45], w[44], selector); w[45] = __byte_perm (w[44], w[43], 
selector); w[44] = __byte_perm (w[43], w[42], selector); w[43] = __byte_perm (w[42], w[41], selector); w[42] = __byte_perm (w[41], w[40], selector); w[41] = __byte_perm (w[40], w[39], selector); w[40] = __byte_perm (w[39], w[38], selector); w[39] = __byte_perm (w[38], w[37], selector); w[38] = __byte_perm (w[37], w[36], selector); w[37] = __byte_perm (w[36], w[35], selector); w[36] = __byte_perm (w[35], w[34], selector); w[35] = __byte_perm (w[34], w[33], selector); w[34] = __byte_perm (w[33], w[32], selector); w[33] = __byte_perm (w[32], w[31], selector); w[32] = __byte_perm (w[31], w[30], selector); w[31] = __byte_perm (w[30], w[29], selector); w[30] = __byte_perm (w[29], w[28], selector); w[29] = __byte_perm (w[28], w[27], selector); w[28] = __byte_perm (w[27], w[26], selector); w[27] = __byte_perm (w[26], w[25], selector); w[26] = __byte_perm (w[25], w[24], selector); w[25] = __byte_perm (w[24], w[23], selector); w[24] = __byte_perm (w[23], w[22], selector); w[23] = __byte_perm (w[22], w[21], selector); w[22] = __byte_perm (w[21], w[20], selector); w[21] = __byte_perm (w[20], w[19], selector); w[20] = __byte_perm (w[19], w[18], selector); w[19] = __byte_perm (w[18], w[17], selector); w[18] = __byte_perm (w[17], w[16], selector); w[17] = __byte_perm (w[16], w[15], selector); w[16] = __byte_perm (w[15], w[14], selector); w[15] = __byte_perm (w[14], w[13], selector); w[14] = __byte_perm (w[13], w[12], selector); w[13] = __byte_perm (w[12], w[11], selector); w[12] = __byte_perm (w[11], w[10], selector); w[11] = __byte_perm (w[10], w[ 9], selector); w[10] = __byte_perm (w[ 9], w[ 8], selector); w[ 9] = __byte_perm (w[ 8], w[ 7], selector); w[ 8] = __byte_perm (w[ 7], w[ 6], selector); w[ 7] = __byte_perm (w[ 6], w[ 5], selector); w[ 6] = __byte_perm (w[ 5], w[ 4], selector); w[ 5] = __byte_perm (w[ 4], w[ 3], selector); w[ 4] = __byte_perm (w[ 3], w[ 2], selector); w[ 3] = __byte_perm (w[ 2], w[ 1], selector); w[ 2] = __byte_perm (w[ 1], w[ 0], selector); w[ 1] = 
__byte_perm (w[ 0], 0, selector); w[ 0] = 0; break; case 2: w[63] = __byte_perm (w[61], w[60], selector); w[62] = __byte_perm (w[60], w[59], selector); w[61] = __byte_perm (w[59], w[58], selector); w[60] = __byte_perm (w[58], w[57], selector); w[59] = __byte_perm (w[57], w[56], selector); w[58] = __byte_perm (w[56], w[55], selector); w[57] = __byte_perm (w[55], w[54], selector); w[56] = __byte_perm (w[54], w[53], selector); w[55] = __byte_perm (w[53], w[52], selector); w[54] = __byte_perm (w[52], w[51], selector); w[53] = __byte_perm (w[51], w[50], selector); w[52] = __byte_perm (w[50], w[49], selector); w[51] = __byte_perm (w[49], w[48], selector); w[50] = __byte_perm (w[48], w[47], selector); w[49] = __byte_perm (w[47], w[46], selector); w[48] = __byte_perm (w[46], w[45], selector); w[47] = __byte_perm (w[45], w[44], selector); w[46] = __byte_perm (w[44], w[43], selector); w[45] = __byte_perm (w[43], w[42], selector); w[44] = __byte_perm (w[42], w[41], selector); w[43] = __byte_perm (w[41], w[40], selector); w[42] = __byte_perm (w[40], w[39], selector); w[41] = __byte_perm (w[39], w[38], selector); w[40] = __byte_perm (w[38], w[37], selector); w[39] = __byte_perm (w[37], w[36], selector); w[38] = __byte_perm (w[36], w[35], selector); w[37] = __byte_perm (w[35], w[34], selector); w[36] = __byte_perm (w[34], w[33], selector); w[35] = __byte_perm (w[33], w[32], selector); w[34] = __byte_perm (w[32], w[31], selector); w[33] = __byte_perm (w[31], w[30], selector); w[32] = __byte_perm (w[30], w[29], selector); w[31] = __byte_perm (w[29], w[28], selector); w[30] = __byte_perm (w[28], w[27], selector); w[29] = __byte_perm (w[27], w[26], selector); w[28] = __byte_perm (w[26], w[25], selector); w[27] = __byte_perm (w[25], w[24], selector); w[26] = __byte_perm (w[24], w[23], selector); w[25] = __byte_perm (w[23], w[22], selector); w[24] = __byte_perm (w[22], w[21], selector); w[23] = __byte_perm (w[21], w[20], selector); w[22] = __byte_perm (w[20], w[19], selector); w[21] = 
__byte_perm (w[19], w[18], selector); w[20] = __byte_perm (w[18], w[17], selector); w[19] = __byte_perm (w[17], w[16], selector); w[18] = __byte_perm (w[16], w[15], selector); w[17] = __byte_perm (w[15], w[14], selector); w[16] = __byte_perm (w[14], w[13], selector); w[15] = __byte_perm (w[13], w[12], selector); w[14] = __byte_perm (w[12], w[11], selector); w[13] = __byte_perm (w[11], w[10], selector); w[12] = __byte_perm (w[10], w[ 9], selector); w[11] = __byte_perm (w[ 9], w[ 8], selector); w[10] = __byte_perm (w[ 8], w[ 7], selector); w[ 9] = __byte_perm (w[ 7], w[ 6], selector); w[ 8] = __byte_perm (w[ 6], w[ 5], selector); w[ 7] = __byte_perm (w[ 5], w[ 4], selector); w[ 6] = __byte_perm (w[ 4], w[ 3], selector); w[ 5] = __byte_perm (w[ 3], w[ 2], selector); w[ 4] = __byte_perm (w[ 2], w[ 1], selector); w[ 3] = __byte_perm (w[ 1], w[ 0], selector); w[ 2] = __byte_perm (w[ 0], 0, selector); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = __byte_perm (w[60], w[59], selector); w[62] = __byte_perm (w[59], w[58], selector); w[61] = __byte_perm (w[58], w[57], selector); w[60] = __byte_perm (w[57], w[56], selector); w[59] = __byte_perm (w[56], w[55], selector); w[58] = __byte_perm (w[55], w[54], selector); w[57] = __byte_perm (w[54], w[53], selector); w[56] = __byte_perm (w[53], w[52], selector); w[55] = __byte_perm (w[52], w[51], selector); w[54] = __byte_perm (w[51], w[50], selector); w[53] = __byte_perm (w[50], w[49], selector); w[52] = __byte_perm (w[49], w[48], selector); w[51] = __byte_perm (w[48], w[47], selector); w[50] = __byte_perm (w[47], w[46], selector); w[49] = __byte_perm (w[46], w[45], selector); w[48] = __byte_perm (w[45], w[44], selector); w[47] = __byte_perm (w[44], w[43], selector); w[46] = __byte_perm (w[43], w[42], selector); w[45] = __byte_perm (w[42], w[41], selector); w[44] = __byte_perm (w[41], w[40], selector); w[43] = __byte_perm (w[40], w[39], selector); w[42] = __byte_perm (w[39], w[38], selector); w[41] = __byte_perm (w[38], w[37], 
selector); w[40] = __byte_perm (w[37], w[36], selector); w[39] = __byte_perm (w[36], w[35], selector); w[38] = __byte_perm (w[35], w[34], selector); w[37] = __byte_perm (w[34], w[33], selector); w[36] = __byte_perm (w[33], w[32], selector); w[35] = __byte_perm (w[32], w[31], selector); w[34] = __byte_perm (w[31], w[30], selector); w[33] = __byte_perm (w[30], w[29], selector); w[32] = __byte_perm (w[29], w[28], selector); w[31] = __byte_perm (w[28], w[27], selector); w[30] = __byte_perm (w[27], w[26], selector); w[29] = __byte_perm (w[26], w[25], selector); w[28] = __byte_perm (w[25], w[24], selector); w[27] = __byte_perm (w[24], w[23], selector); w[26] = __byte_perm (w[23], w[22], selector); w[25] = __byte_perm (w[22], w[21], selector); w[24] = __byte_perm (w[21], w[20], selector); w[23] = __byte_perm (w[20], w[19], selector); w[22] = __byte_perm (w[19], w[18], selector); w[21] = __byte_perm (w[18], w[17], selector); w[20] = __byte_perm (w[17], w[16], selector); w[19] = __byte_perm (w[16], w[15], selector); w[18] = __byte_perm (w[15], w[14], selector); w[17] = __byte_perm (w[14], w[13], selector); w[16] = __byte_perm (w[13], w[12], selector); w[15] = __byte_perm (w[12], w[11], selector); w[14] = __byte_perm (w[11], w[10], selector); w[13] = __byte_perm (w[10], w[ 9], selector); w[12] = __byte_perm (w[ 9], w[ 8], selector); w[11] = __byte_perm (w[ 8], w[ 7], selector); w[10] = __byte_perm (w[ 7], w[ 6], selector); w[ 9] = __byte_perm (w[ 6], w[ 5], selector); w[ 8] = __byte_perm (w[ 5], w[ 4], selector); w[ 7] = __byte_perm (w[ 4], w[ 3], selector); w[ 6] = __byte_perm (w[ 3], w[ 2], selector); w[ 5] = __byte_perm (w[ 2], w[ 1], selector); w[ 4] = __byte_perm (w[ 1], w[ 0], selector); w[ 3] = __byte_perm (w[ 0], 0, selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = __byte_perm (w[59], w[58], selector); w[62] = __byte_perm (w[58], w[57], selector); w[61] = __byte_perm (w[57], w[56], selector); w[60] = __byte_perm (w[56], w[55], selector); w[59] = 
__byte_perm (w[55], w[54], selector); w[58] = __byte_perm (w[54], w[53], selector); w[57] = __byte_perm (w[53], w[52], selector); w[56] = __byte_perm (w[52], w[51], selector); w[55] = __byte_perm (w[51], w[50], selector); w[54] = __byte_perm (w[50], w[49], selector); w[53] = __byte_perm (w[49], w[48], selector); w[52] = __byte_perm (w[48], w[47], selector); w[51] = __byte_perm (w[47], w[46], selector); w[50] = __byte_perm (w[46], w[45], selector); w[49] = __byte_perm (w[45], w[44], selector); w[48] = __byte_perm (w[44], w[43], selector); w[47] = __byte_perm (w[43], w[42], selector); w[46] = __byte_perm (w[42], w[41], selector); w[45] = __byte_perm (w[41], w[40], selector); w[44] = __byte_perm (w[40], w[39], selector); w[43] = __byte_perm (w[39], w[38], selector); w[42] = __byte_perm (w[38], w[37], selector); w[41] = __byte_perm (w[37], w[36], selector); w[40] = __byte_perm (w[36], w[35], selector); w[39] = __byte_perm (w[35], w[34], selector); w[38] = __byte_perm (w[34], w[33], selector); w[37] = __byte_perm (w[33], w[32], selector); w[36] = __byte_perm (w[32], w[31], selector); w[35] = __byte_perm (w[31], w[30], selector); w[34] = __byte_perm (w[30], w[29], selector); w[33] = __byte_perm (w[29], w[28], selector); w[32] = __byte_perm (w[28], w[27], selector); w[31] = __byte_perm (w[27], w[26], selector); w[30] = __byte_perm (w[26], w[25], selector); w[29] = __byte_perm (w[25], w[24], selector); w[28] = __byte_perm (w[24], w[23], selector); w[27] = __byte_perm (w[23], w[22], selector); w[26] = __byte_perm (w[22], w[21], selector); w[25] = __byte_perm (w[21], w[20], selector); w[24] = __byte_perm (w[20], w[19], selector); w[23] = __byte_perm (w[19], w[18], selector); w[22] = __byte_perm (w[18], w[17], selector); w[21] = __byte_perm (w[17], w[16], selector); w[20] = __byte_perm (w[16], w[15], selector); w[19] = __byte_perm (w[15], w[14], selector); w[18] = __byte_perm (w[14], w[13], selector); w[17] = __byte_perm (w[13], w[12], selector); w[16] = __byte_perm (w[12], 
w[11], selector); w[15] = __byte_perm (w[11], w[10], selector); w[14] = __byte_perm (w[10], w[ 9], selector); w[13] = __byte_perm (w[ 9], w[ 8], selector); w[12] = __byte_perm (w[ 8], w[ 7], selector); w[11] = __byte_perm (w[ 7], w[ 6], selector); w[10] = __byte_perm (w[ 6], w[ 5], selector); w[ 9] = __byte_perm (w[ 5], w[ 4], selector); w[ 8] = __byte_perm (w[ 4], w[ 3], selector); w[ 7] = __byte_perm (w[ 3], w[ 2], selector); w[ 6] = __byte_perm (w[ 2], w[ 1], selector); w[ 5] = __byte_perm (w[ 1], w[ 0], selector); w[ 4] = __byte_perm (w[ 0], 0, selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = __byte_perm (w[58], w[57], selector); w[62] = __byte_perm (w[57], w[56], selector); w[61] = __byte_perm (w[56], w[55], selector); w[60] = __byte_perm (w[55], w[54], selector); w[59] = __byte_perm (w[54], w[53], selector); w[58] = __byte_perm (w[53], w[52], selector); w[57] = __byte_perm (w[52], w[51], selector); w[56] = __byte_perm (w[51], w[50], selector); w[55] = __byte_perm (w[50], w[49], selector); w[54] = __byte_perm (w[49], w[48], selector); w[53] = __byte_perm (w[48], w[47], selector); w[52] = __byte_perm (w[47], w[46], selector); w[51] = __byte_perm (w[46], w[45], selector); w[50] = __byte_perm (w[45], w[44], selector); w[49] = __byte_perm (w[44], w[43], selector); w[48] = __byte_perm (w[43], w[42], selector); w[47] = __byte_perm (w[42], w[41], selector); w[46] = __byte_perm (w[41], w[40], selector); w[45] = __byte_perm (w[40], w[39], selector); w[44] = __byte_perm (w[39], w[38], selector); w[43] = __byte_perm (w[38], w[37], selector); w[42] = __byte_perm (w[37], w[36], selector); w[41] = __byte_perm (w[36], w[35], selector); w[40] = __byte_perm (w[35], w[34], selector); w[39] = __byte_perm (w[34], w[33], selector); w[38] = __byte_perm (w[33], w[32], selector); w[37] = __byte_perm (w[32], w[31], selector); w[36] = __byte_perm (w[31], w[30], selector); w[35] = __byte_perm (w[30], w[29], selector); w[34] = __byte_perm (w[29], w[28], 
selector); w[33] = __byte_perm (w[28], w[27], selector); w[32] = __byte_perm (w[27], w[26], selector); w[31] = __byte_perm (w[26], w[25], selector); w[30] = __byte_perm (w[25], w[24], selector); w[29] = __byte_perm (w[24], w[23], selector); w[28] = __byte_perm (w[23], w[22], selector); w[27] = __byte_perm (w[22], w[21], selector); w[26] = __byte_perm (w[21], w[20], selector); w[25] = __byte_perm (w[20], w[19], selector); w[24] = __byte_perm (w[19], w[18], selector); w[23] = __byte_perm (w[18], w[17], selector); w[22] = __byte_perm (w[17], w[16], selector); w[21] = __byte_perm (w[16], w[15], selector); w[20] = __byte_perm (w[15], w[14], selector); w[19] = __byte_perm (w[14], w[13], selector); w[18] = __byte_perm (w[13], w[12], selector); w[17] = __byte_perm (w[12], w[11], selector); w[16] = __byte_perm (w[11], w[10], selector); w[15] = __byte_perm (w[10], w[ 9], selector); w[14] = __byte_perm (w[ 9], w[ 8], selector); w[13] = __byte_perm (w[ 8], w[ 7], selector); w[12] = __byte_perm (w[ 7], w[ 6], selector); w[11] = __byte_perm (w[ 6], w[ 5], selector); w[10] = __byte_perm (w[ 5], w[ 4], selector); w[ 9] = __byte_perm (w[ 4], w[ 3], selector); w[ 8] = __byte_perm (w[ 3], w[ 2], selector); w[ 7] = __byte_perm (w[ 2], w[ 1], selector); w[ 6] = __byte_perm (w[ 1], w[ 0], selector); w[ 5] = __byte_perm (w[ 0], 0, selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = __byte_perm (w[57], w[56], selector); w[62] = __byte_perm (w[56], w[55], selector); w[61] = __byte_perm (w[55], w[54], selector); w[60] = __byte_perm (w[54], w[53], selector); w[59] = __byte_perm (w[53], w[52], selector); w[58] = __byte_perm (w[52], w[51], selector); w[57] = __byte_perm (w[51], w[50], selector); w[56] = __byte_perm (w[50], w[49], selector); w[55] = __byte_perm (w[49], w[48], selector); w[54] = __byte_perm (w[48], w[47], selector); w[53] = __byte_perm (w[47], w[46], selector); w[52] = __byte_perm (w[46], w[45], selector); w[51] = __byte_perm (w[45], w[44], 
selector); w[50] = __byte_perm (w[44], w[43], selector); w[49] = __byte_perm (w[43], w[42], selector); w[48] = __byte_perm (w[42], w[41], selector); w[47] = __byte_perm (w[41], w[40], selector); w[46] = __byte_perm (w[40], w[39], selector); w[45] = __byte_perm (w[39], w[38], selector); w[44] = __byte_perm (w[38], w[37], selector); w[43] = __byte_perm (w[37], w[36], selector); w[42] = __byte_perm (w[36], w[35], selector); w[41] = __byte_perm (w[35], w[34], selector); w[40] = __byte_perm (w[34], w[33], selector); w[39] = __byte_perm (w[33], w[32], selector); w[38] = __byte_perm (w[32], w[31], selector); w[37] = __byte_perm (w[31], w[30], selector); w[36] = __byte_perm (w[30], w[29], selector); w[35] = __byte_perm (w[29], w[28], selector); w[34] = __byte_perm (w[28], w[27], selector); w[33] = __byte_perm (w[27], w[26], selector); w[32] = __byte_perm (w[26], w[25], selector); w[31] = __byte_perm (w[25], w[24], selector); w[30] = __byte_perm (w[24], w[23], selector); w[29] = __byte_perm (w[23], w[22], selector); w[28] = __byte_perm (w[22], w[21], selector); w[27] = __byte_perm (w[21], w[20], selector); w[26] = __byte_perm (w[20], w[19], selector); w[25] = __byte_perm (w[19], w[18], selector); w[24] = __byte_perm (w[18], w[17], selector); w[23] = __byte_perm (w[17], w[16], selector); w[22] = __byte_perm (w[16], w[15], selector); w[21] = __byte_perm (w[15], w[14], selector); w[20] = __byte_perm (w[14], w[13], selector); w[19] = __byte_perm (w[13], w[12], selector); w[18] = __byte_perm (w[12], w[11], selector); w[17] = __byte_perm (w[11], w[10], selector); w[16] = __byte_perm (w[10], w[ 9], selector); w[15] = __byte_perm (w[ 9], w[ 8], selector); w[14] = __byte_perm (w[ 8], w[ 7], selector); w[13] = __byte_perm (w[ 7], w[ 6], selector); w[12] = __byte_perm (w[ 6], w[ 5], selector); w[11] = __byte_perm (w[ 5], w[ 4], selector); w[10] = __byte_perm (w[ 4], w[ 3], selector); w[ 9] = __byte_perm (w[ 3], w[ 2], selector); w[ 8] = __byte_perm (w[ 2], w[ 1], selector); w[ 7] = 
__byte_perm (w[ 1], w[ 0], selector); w[ 6] = __byte_perm (w[ 0], 0, selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = __byte_perm (w[56], w[55], selector); w[62] = __byte_perm (w[55], w[54], selector); w[61] = __byte_perm (w[54], w[53], selector); w[60] = __byte_perm (w[53], w[52], selector); w[59] = __byte_perm (w[52], w[51], selector); w[58] = __byte_perm (w[51], w[50], selector); w[57] = __byte_perm (w[50], w[49], selector); w[56] = __byte_perm (w[49], w[48], selector); w[55] = __byte_perm (w[48], w[47], selector); w[54] = __byte_perm (w[47], w[46], selector); w[53] = __byte_perm (w[46], w[45], selector); w[52] = __byte_perm (w[45], w[44], selector); w[51] = __byte_perm (w[44], w[43], selector); w[50] = __byte_perm (w[43], w[42], selector); w[49] = __byte_perm (w[42], w[41], selector); w[48] = __byte_perm (w[41], w[40], selector); w[47] = __byte_perm (w[40], w[39], selector); w[46] = __byte_perm (w[39], w[38], selector); w[45] = __byte_perm (w[38], w[37], selector); w[44] = __byte_perm (w[37], w[36], selector); w[43] = __byte_perm (w[36], w[35], selector); w[42] = __byte_perm (w[35], w[34], selector); w[41] = __byte_perm (w[34], w[33], selector); w[40] = __byte_perm (w[33], w[32], selector); w[39] = __byte_perm (w[32], w[31], selector); w[38] = __byte_perm (w[31], w[30], selector); w[37] = __byte_perm (w[30], w[29], selector); w[36] = __byte_perm (w[29], w[28], selector); w[35] = __byte_perm (w[28], w[27], selector); w[34] = __byte_perm (w[27], w[26], selector); w[33] = __byte_perm (w[26], w[25], selector); w[32] = __byte_perm (w[25], w[24], selector); w[31] = __byte_perm (w[24], w[23], selector); w[30] = __byte_perm (w[23], w[22], selector); w[29] = __byte_perm (w[22], w[21], selector); w[28] = __byte_perm (w[21], w[20], selector); w[27] = __byte_perm (w[20], w[19], selector); w[26] = __byte_perm (w[19], w[18], selector); w[25] = __byte_perm (w[18], w[17], selector); w[24] = __byte_perm (w[17], w[16], 
selector); w[23] = __byte_perm (w[16], w[15], selector); w[22] = __byte_perm (w[15], w[14], selector); w[21] = __byte_perm (w[14], w[13], selector); w[20] = __byte_perm (w[13], w[12], selector); w[19] = __byte_perm (w[12], w[11], selector); w[18] = __byte_perm (w[11], w[10], selector); w[17] = __byte_perm (w[10], w[ 9], selector); w[16] = __byte_perm (w[ 9], w[ 8], selector); w[15] = __byte_perm (w[ 8], w[ 7], selector); w[14] = __byte_perm (w[ 7], w[ 6], selector); w[13] = __byte_perm (w[ 6], w[ 5], selector); w[12] = __byte_perm (w[ 5], w[ 4], selector); w[11] = __byte_perm (w[ 4], w[ 3], selector); w[10] = __byte_perm (w[ 3], w[ 2], selector); w[ 9] = __byte_perm (w[ 2], w[ 1], selector); w[ 8] = __byte_perm (w[ 1], w[ 0], selector); w[ 7] = __byte_perm (w[ 0], 0, selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = __byte_perm (w[55], w[54], selector); w[62] = __byte_perm (w[54], w[53], selector); w[61] = __byte_perm (w[53], w[52], selector); w[60] = __byte_perm (w[52], w[51], selector); w[59] = __byte_perm (w[51], w[50], selector); w[58] = __byte_perm (w[50], w[49], selector); w[57] = __byte_perm (w[49], w[48], selector); w[56] = __byte_perm (w[48], w[47], selector); w[55] = __byte_perm (w[47], w[46], selector); w[54] = __byte_perm (w[46], w[45], selector); w[53] = __byte_perm (w[45], w[44], selector); w[52] = __byte_perm (w[44], w[43], selector); w[51] = __byte_perm (w[43], w[42], selector); w[50] = __byte_perm (w[42], w[41], selector); w[49] = __byte_perm (w[41], w[40], selector); w[48] = __byte_perm (w[40], w[39], selector); w[47] = __byte_perm (w[39], w[38], selector); w[46] = __byte_perm (w[38], w[37], selector); w[45] = __byte_perm (w[37], w[36], selector); w[44] = __byte_perm (w[36], w[35], selector); w[43] = __byte_perm (w[35], w[34], selector); w[42] = __byte_perm (w[34], w[33], selector); w[41] = __byte_perm (w[33], w[32], selector); w[40] = __byte_perm (w[32], w[31], selector); w[39] = 
__byte_perm (w[31], w[30], selector); w[38] = __byte_perm (w[30], w[29], selector); w[37] = __byte_perm (w[29], w[28], selector); w[36] = __byte_perm (w[28], w[27], selector); w[35] = __byte_perm (w[27], w[26], selector); w[34] = __byte_perm (w[26], w[25], selector); w[33] = __byte_perm (w[25], w[24], selector); w[32] = __byte_perm (w[24], w[23], selector); w[31] = __byte_perm (w[23], w[22], selector); w[30] = __byte_perm (w[22], w[21], selector); w[29] = __byte_perm (w[21], w[20], selector); w[28] = __byte_perm (w[20], w[19], selector); w[27] = __byte_perm (w[19], w[18], selector); w[26] = __byte_perm (w[18], w[17], selector); w[25] = __byte_perm (w[17], w[16], selector); w[24] = __byte_perm (w[16], w[15], selector); w[23] = __byte_perm (w[15], w[14], selector); w[22] = __byte_perm (w[14], w[13], selector); w[21] = __byte_perm (w[13], w[12], selector); w[20] = __byte_perm (w[12], w[11], selector); w[19] = __byte_perm (w[11], w[10], selector); w[18] = __byte_perm (w[10], w[ 9], selector); w[17] = __byte_perm (w[ 9], w[ 8], selector); w[16] = __byte_perm (w[ 8], w[ 7], selector); w[15] = __byte_perm (w[ 7], w[ 6], selector); w[14] = __byte_perm (w[ 6], w[ 5], selector); w[13] = __byte_perm (w[ 5], w[ 4], selector); w[12] = __byte_perm (w[ 4], w[ 3], selector); w[11] = __byte_perm (w[ 3], w[ 2], selector); w[10] = __byte_perm (w[ 2], w[ 1], selector); w[ 9] = __byte_perm (w[ 1], w[ 0], selector); w[ 8] = __byte_perm (w[ 0], 0, selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = __byte_perm (w[54], w[53], selector); w[62] = __byte_perm (w[53], w[52], selector); w[61] = __byte_perm (w[52], w[51], selector); w[60] = __byte_perm (w[51], w[50], selector); w[59] = __byte_perm (w[50], w[49], selector); w[58] = __byte_perm (w[49], w[48], selector); w[57] = __byte_perm (w[48], w[47], selector); w[56] = __byte_perm (w[47], w[46], selector); w[55] = __byte_perm (w[46], w[45], selector); w[54] = __byte_perm 
(w[45], w[44], selector); w[53] = __byte_perm (w[44], w[43], selector); w[52] = __byte_perm (w[43], w[42], selector); w[51] = __byte_perm (w[42], w[41], selector); w[50] = __byte_perm (w[41], w[40], selector); w[49] = __byte_perm (w[40], w[39], selector); w[48] = __byte_perm (w[39], w[38], selector); w[47] = __byte_perm (w[38], w[37], selector); w[46] = __byte_perm (w[37], w[36], selector); w[45] = __byte_perm (w[36], w[35], selector); w[44] = __byte_perm (w[35], w[34], selector); w[43] = __byte_perm (w[34], w[33], selector); w[42] = __byte_perm (w[33], w[32], selector); w[41] = __byte_perm (w[32], w[31], selector); w[40] = __byte_perm (w[31], w[30], selector); w[39] = __byte_perm (w[30], w[29], selector); w[38] = __byte_perm (w[29], w[28], selector); w[37] = __byte_perm (w[28], w[27], selector); w[36] = __byte_perm (w[27], w[26], selector); w[35] = __byte_perm (w[26], w[25], selector); w[34] = __byte_perm (w[25], w[24], selector); w[33] = __byte_perm (w[24], w[23], selector); w[32] = __byte_perm (w[23], w[22], selector); w[31] = __byte_perm (w[22], w[21], selector); w[30] = __byte_perm (w[21], w[20], selector); w[29] = __byte_perm (w[20], w[19], selector); w[28] = __byte_perm (w[19], w[18], selector); w[27] = __byte_perm (w[18], w[17], selector); w[26] = __byte_perm (w[17], w[16], selector); w[25] = __byte_perm (w[16], w[15], selector); w[24] = __byte_perm (w[15], w[14], selector); w[23] = __byte_perm (w[14], w[13], selector); w[22] = __byte_perm (w[13], w[12], selector); w[21] = __byte_perm (w[12], w[11], selector); w[20] = __byte_perm (w[11], w[10], selector); w[19] = __byte_perm (w[10], w[ 9], selector); w[18] = __byte_perm (w[ 9], w[ 8], selector); w[17] = __byte_perm (w[ 8], w[ 7], selector); w[16] = __byte_perm (w[ 7], w[ 6], selector); w[15] = __byte_perm (w[ 6], w[ 5], selector); w[14] = __byte_perm (w[ 5], w[ 4], selector); w[13] = __byte_perm (w[ 4], w[ 3], selector); w[12] = __byte_perm (w[ 3], w[ 2], selector); w[11] = __byte_perm (w[ 2], w[ 1], 
selector); w[10] = __byte_perm (w[ 1], w[ 0], selector); w[ 9] = __byte_perm (w[ 0], 0, selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = __byte_perm (w[53], w[52], selector); w[62] = __byte_perm (w[52], w[51], selector); w[61] = __byte_perm (w[51], w[50], selector); w[60] = __byte_perm (w[50], w[49], selector); w[59] = __byte_perm (w[49], w[48], selector); w[58] = __byte_perm (w[48], w[47], selector); w[57] = __byte_perm (w[47], w[46], selector); w[56] = __byte_perm (w[46], w[45], selector); w[55] = __byte_perm (w[45], w[44], selector); w[54] = __byte_perm (w[44], w[43], selector); w[53] = __byte_perm (w[43], w[42], selector); w[52] = __byte_perm (w[42], w[41], selector); w[51] = __byte_perm (w[41], w[40], selector); w[50] = __byte_perm (w[40], w[39], selector); w[49] = __byte_perm (w[39], w[38], selector); w[48] = __byte_perm (w[38], w[37], selector); w[47] = __byte_perm (w[37], w[36], selector); w[46] = __byte_perm (w[36], w[35], selector); w[45] = __byte_perm (w[35], w[34], selector); w[44] = __byte_perm (w[34], w[33], selector); w[43] = __byte_perm (w[33], w[32], selector); w[42] = __byte_perm (w[32], w[31], selector); w[41] = __byte_perm (w[31], w[30], selector); w[40] = __byte_perm (w[30], w[29], selector); w[39] = __byte_perm (w[29], w[28], selector); w[38] = __byte_perm (w[28], w[27], selector); w[37] = __byte_perm (w[27], w[26], selector); w[36] = __byte_perm (w[26], w[25], selector); w[35] = __byte_perm (w[25], w[24], selector); w[34] = __byte_perm (w[24], w[23], selector); w[33] = __byte_perm (w[23], w[22], selector); w[32] = __byte_perm (w[22], w[21], selector); w[31] = __byte_perm (w[21], w[20], selector); w[30] = __byte_perm (w[20], w[19], selector); w[29] = __byte_perm (w[19], w[18], selector); w[28] = __byte_perm (w[18], w[17], selector); w[27] = __byte_perm (w[17], w[16], selector); w[26] = __byte_perm (w[16], w[15], selector); w[25] = __byte_perm (w[15], w[14], 
selector); w[24] = __byte_perm (w[14], w[13], selector); w[23] = __byte_perm (w[13], w[12], selector); w[22] = __byte_perm (w[12], w[11], selector); w[21] = __byte_perm (w[11], w[10], selector); w[20] = __byte_perm (w[10], w[ 9], selector); w[19] = __byte_perm (w[ 9], w[ 8], selector); w[18] = __byte_perm (w[ 8], w[ 7], selector); w[17] = __byte_perm (w[ 7], w[ 6], selector); w[16] = __byte_perm (w[ 6], w[ 5], selector); w[15] = __byte_perm (w[ 5], w[ 4], selector); w[14] = __byte_perm (w[ 4], w[ 3], selector); w[13] = __byte_perm (w[ 3], w[ 2], selector); w[12] = __byte_perm (w[ 2], w[ 1], selector); w[11] = __byte_perm (w[ 1], w[ 0], selector); w[10] = __byte_perm (w[ 0], 0, selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = __byte_perm (w[52], w[51], selector); w[62] = __byte_perm (w[51], w[50], selector); w[61] = __byte_perm (w[50], w[49], selector); w[60] = __byte_perm (w[49], w[48], selector); w[59] = __byte_perm (w[48], w[47], selector); w[58] = __byte_perm (w[47], w[46], selector); w[57] = __byte_perm (w[46], w[45], selector); w[56] = __byte_perm (w[45], w[44], selector); w[55] = __byte_perm (w[44], w[43], selector); w[54] = __byte_perm (w[43], w[42], selector); w[53] = __byte_perm (w[42], w[41], selector); w[52] = __byte_perm (w[41], w[40], selector); w[51] = __byte_perm (w[40], w[39], selector); w[50] = __byte_perm (w[39], w[38], selector); w[49] = __byte_perm (w[38], w[37], selector); w[48] = __byte_perm (w[37], w[36], selector); w[47] = __byte_perm (w[36], w[35], selector); w[46] = __byte_perm (w[35], w[34], selector); w[45] = __byte_perm (w[34], w[33], selector); w[44] = __byte_perm (w[33], w[32], selector); w[43] = __byte_perm (w[32], w[31], selector); w[42] = __byte_perm (w[31], w[30], selector); w[41] = __byte_perm (w[30], w[29], selector); w[40] = __byte_perm (w[29], w[28], selector); w[39] = __byte_perm (w[28], w[27], selector); w[38] = __byte_perm 
(w[27], w[26], selector); w[37] = __byte_perm (w[26], w[25], selector); w[36] = __byte_perm (w[25], w[24], selector); w[35] = __byte_perm (w[24], w[23], selector); w[34] = __byte_perm (w[23], w[22], selector); w[33] = __byte_perm (w[22], w[21], selector); w[32] = __byte_perm (w[21], w[20], selector); w[31] = __byte_perm (w[20], w[19], selector); w[30] = __byte_perm (w[19], w[18], selector); w[29] = __byte_perm (w[18], w[17], selector); w[28] = __byte_perm (w[17], w[16], selector); w[27] = __byte_perm (w[16], w[15], selector); w[26] = __byte_perm (w[15], w[14], selector); w[25] = __byte_perm (w[14], w[13], selector); w[24] = __byte_perm (w[13], w[12], selector); w[23] = __byte_perm (w[12], w[11], selector); w[22] = __byte_perm (w[11], w[10], selector); w[21] = __byte_perm (w[10], w[ 9], selector); w[20] = __byte_perm (w[ 9], w[ 8], selector); w[19] = __byte_perm (w[ 8], w[ 7], selector); w[18] = __byte_perm (w[ 7], w[ 6], selector); w[17] = __byte_perm (w[ 6], w[ 5], selector); w[16] = __byte_perm (w[ 5], w[ 4], selector); w[15] = __byte_perm (w[ 4], w[ 3], selector); w[14] = __byte_perm (w[ 3], w[ 2], selector); w[13] = __byte_perm (w[ 2], w[ 1], selector); w[12] = __byte_perm (w[ 1], w[ 0], selector); w[11] = __byte_perm (w[ 0], 0, selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = __byte_perm (w[51], w[50], selector); w[62] = __byte_perm (w[50], w[49], selector); w[61] = __byte_perm (w[49], w[48], selector); w[60] = __byte_perm (w[48], w[47], selector); w[59] = __byte_perm (w[47], w[46], selector); w[58] = __byte_perm (w[46], w[45], selector); w[57] = __byte_perm (w[45], w[44], selector); w[56] = __byte_perm (w[44], w[43], selector); w[55] = __byte_perm (w[43], w[42], selector); w[54] = __byte_perm (w[42], w[41], selector); w[53] = __byte_perm (w[41], w[40], selector); w[52] = __byte_perm (w[40], w[39], selector); w[51] = __byte_perm (w[39], w[38], selector); 
w[50] = __byte_perm (w[38], w[37], selector); w[49] = __byte_perm (w[37], w[36], selector); w[48] = __byte_perm (w[36], w[35], selector); w[47] = __byte_perm (w[35], w[34], selector); w[46] = __byte_perm (w[34], w[33], selector); w[45] = __byte_perm (w[33], w[32], selector); w[44] = __byte_perm (w[32], w[31], selector); w[43] = __byte_perm (w[31], w[30], selector); w[42] = __byte_perm (w[30], w[29], selector); w[41] = __byte_perm (w[29], w[28], selector); w[40] = __byte_perm (w[28], w[27], selector); w[39] = __byte_perm (w[27], w[26], selector); w[38] = __byte_perm (w[26], w[25], selector); w[37] = __byte_perm (w[25], w[24], selector); w[36] = __byte_perm (w[24], w[23], selector); w[35] = __byte_perm (w[23], w[22], selector); w[34] = __byte_perm (w[22], w[21], selector); w[33] = __byte_perm (w[21], w[20], selector); w[32] = __byte_perm (w[20], w[19], selector); w[31] = __byte_perm (w[19], w[18], selector); w[30] = __byte_perm (w[18], w[17], selector); w[29] = __byte_perm (w[17], w[16], selector); w[28] = __byte_perm (w[16], w[15], selector); w[27] = __byte_perm (w[15], w[14], selector); w[26] = __byte_perm (w[14], w[13], selector); w[25] = __byte_perm (w[13], w[12], selector); w[24] = __byte_perm (w[12], w[11], selector); w[23] = __byte_perm (w[11], w[10], selector); w[22] = __byte_perm (w[10], w[ 9], selector); w[21] = __byte_perm (w[ 9], w[ 8], selector); w[20] = __byte_perm (w[ 8], w[ 7], selector); w[19] = __byte_perm (w[ 7], w[ 6], selector); w[18] = __byte_perm (w[ 6], w[ 5], selector); w[17] = __byte_perm (w[ 5], w[ 4], selector); w[16] = __byte_perm (w[ 4], w[ 3], selector); w[15] = __byte_perm (w[ 3], w[ 2], selector); w[14] = __byte_perm (w[ 2], w[ 1], selector); w[13] = __byte_perm (w[ 1], w[ 0], selector); w[12] = __byte_perm (w[ 0], 0, selector); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = __byte_perm (w[50], w[49], selector); w[62] = 
__byte_perm (w[49], w[48], selector); w[61] = __byte_perm (w[48], w[47], selector); w[60] = __byte_perm (w[47], w[46], selector); w[59] = __byte_perm (w[46], w[45], selector); w[58] = __byte_perm (w[45], w[44], selector); w[57] = __byte_perm (w[44], w[43], selector); w[56] = __byte_perm (w[43], w[42], selector); w[55] = __byte_perm (w[42], w[41], selector); w[54] = __byte_perm (w[41], w[40], selector); w[53] = __byte_perm (w[40], w[39], selector); w[52] = __byte_perm (w[39], w[38], selector); w[51] = __byte_perm (w[38], w[37], selector); w[50] = __byte_perm (w[37], w[36], selector); w[49] = __byte_perm (w[36], w[35], selector); w[48] = __byte_perm (w[35], w[34], selector); w[47] = __byte_perm (w[34], w[33], selector); w[46] = __byte_perm (w[33], w[32], selector); w[45] = __byte_perm (w[32], w[31], selector); w[44] = __byte_perm (w[31], w[30], selector); w[43] = __byte_perm (w[30], w[29], selector); w[42] = __byte_perm (w[29], w[28], selector); w[41] = __byte_perm (w[28], w[27], selector); w[40] = __byte_perm (w[27], w[26], selector); w[39] = __byte_perm (w[26], w[25], selector); w[38] = __byte_perm (w[25], w[24], selector); w[37] = __byte_perm (w[24], w[23], selector); w[36] = __byte_perm (w[23], w[22], selector); w[35] = __byte_perm (w[22], w[21], selector); w[34] = __byte_perm (w[21], w[20], selector); w[33] = __byte_perm (w[20], w[19], selector); w[32] = __byte_perm (w[19], w[18], selector); w[31] = __byte_perm (w[18], w[17], selector); w[30] = __byte_perm (w[17], w[16], selector); w[29] = __byte_perm (w[16], w[15], selector); w[28] = __byte_perm (w[15], w[14], selector); w[27] = __byte_perm (w[14], w[13], selector); w[26] = __byte_perm (w[13], w[12], selector); w[25] = __byte_perm (w[12], w[11], selector); w[24] = __byte_perm (w[11], w[10], selector); w[23] = __byte_perm (w[10], w[ 9], selector); w[22] = __byte_perm (w[ 9], w[ 8], selector); w[21] = __byte_perm (w[ 8], w[ 7], selector); w[20] = __byte_perm (w[ 7], w[ 6], selector); w[19] = __byte_perm (w[ 6], 
w[ 5], selector); w[18] = __byte_perm (w[ 5], w[ 4], selector); w[17] = __byte_perm (w[ 4], w[ 3], selector); w[16] = __byte_perm (w[ 3], w[ 2], selector); w[15] = __byte_perm (w[ 2], w[ 1], selector); w[14] = __byte_perm (w[ 1], w[ 0], selector); w[13] = __byte_perm (w[ 0], 0, selector); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 14: w[63] = __byte_perm (w[49], w[48], selector); w[62] = __byte_perm (w[48], w[47], selector); w[61] = __byte_perm (w[47], w[46], selector); w[60] = __byte_perm (w[46], w[45], selector); w[59] = __byte_perm (w[45], w[44], selector); w[58] = __byte_perm (w[44], w[43], selector); w[57] = __byte_perm (w[43], w[42], selector); w[56] = __byte_perm (w[42], w[41], selector); w[55] = __byte_perm (w[41], w[40], selector); w[54] = __byte_perm (w[40], w[39], selector); w[53] = __byte_perm (w[39], w[38], selector); w[52] = __byte_perm (w[38], w[37], selector); w[51] = __byte_perm (w[37], w[36], selector); w[50] = __byte_perm (w[36], w[35], selector); w[49] = __byte_perm (w[35], w[34], selector); w[48] = __byte_perm (w[34], w[33], selector); w[47] = __byte_perm (w[33], w[32], selector); w[46] = __byte_perm (w[32], w[31], selector); w[45] = __byte_perm (w[31], w[30], selector); w[44] = __byte_perm (w[30], w[29], selector); w[43] = __byte_perm (w[29], w[28], selector); w[42] = __byte_perm (w[28], w[27], selector); w[41] = __byte_perm (w[27], w[26], selector); w[40] = __byte_perm (w[26], w[25], selector); w[39] = __byte_perm (w[25], w[24], selector); w[38] = __byte_perm (w[24], w[23], selector); w[37] = __byte_perm (w[23], w[22], selector); w[36] = __byte_perm (w[22], w[21], selector); w[35] = __byte_perm (w[21], w[20], selector); w[34] = __byte_perm (w[20], w[19], selector); w[33] = __byte_perm (w[19], w[18], selector); w[32] = __byte_perm (w[18], w[17], selector); w[31] = __byte_perm (w[17], w[16], selector); w[30] = __byte_perm (w[16], 
w[15], selector); w[29] = __byte_perm (w[15], w[14], selector); w[28] = __byte_perm (w[14], w[13], selector); w[27] = __byte_perm (w[13], w[12], selector); w[26] = __byte_perm (w[12], w[11], selector); w[25] = __byte_perm (w[11], w[10], selector); w[24] = __byte_perm (w[10], w[ 9], selector); w[23] = __byte_perm (w[ 9], w[ 8], selector); w[22] = __byte_perm (w[ 8], w[ 7], selector); w[21] = __byte_perm (w[ 7], w[ 6], selector); w[20] = __byte_perm (w[ 6], w[ 5], selector); w[19] = __byte_perm (w[ 5], w[ 4], selector); w[18] = __byte_perm (w[ 4], w[ 3], selector); w[17] = __byte_perm (w[ 3], w[ 2], selector); w[16] = __byte_perm (w[ 2], w[ 1], selector); w[15] = __byte_perm (w[ 1], w[ 0], selector); w[14] = __byte_perm (w[ 0], 0, selector); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = __byte_perm (w[48], w[47], selector); w[62] = __byte_perm (w[47], w[46], selector); w[61] = __byte_perm (w[46], w[45], selector); w[60] = __byte_perm (w[45], w[44], selector); w[59] = __byte_perm (w[44], w[43], selector); w[58] = __byte_perm (w[43], w[42], selector); w[57] = __byte_perm (w[42], w[41], selector); w[56] = __byte_perm (w[41], w[40], selector); w[55] = __byte_perm (w[40], w[39], selector); w[54] = __byte_perm (w[39], w[38], selector); w[53] = __byte_perm (w[38], w[37], selector); w[52] = __byte_perm (w[37], w[36], selector); w[51] = __byte_perm (w[36], w[35], selector); w[50] = __byte_perm (w[35], w[34], selector); w[49] = __byte_perm (w[34], w[33], selector); w[48] = __byte_perm (w[33], w[32], selector); w[47] = __byte_perm (w[32], w[31], selector); w[46] = __byte_perm (w[31], w[30], selector); w[45] = __byte_perm (w[30], w[29], selector); w[44] = __byte_perm (w[29], w[28], selector); w[43] = __byte_perm (w[28], w[27], selector); w[42] = __byte_perm (w[27], w[26], selector); w[41] = __byte_perm (w[26], w[25], selector); w[40] = __byte_perm 
(w[25], w[24], selector); w[39] = __byte_perm (w[24], w[23], selector); w[38] = __byte_perm (w[23], w[22], selector); w[37] = __byte_perm (w[22], w[21], selector); w[36] = __byte_perm (w[21], w[20], selector); w[35] = __byte_perm (w[20], w[19], selector); w[34] = __byte_perm (w[19], w[18], selector); w[33] = __byte_perm (w[18], w[17], selector); w[32] = __byte_perm (w[17], w[16], selector); w[31] = __byte_perm (w[16], w[15], selector); w[30] = __byte_perm (w[15], w[14], selector); w[29] = __byte_perm (w[14], w[13], selector); w[28] = __byte_perm (w[13], w[12], selector); w[27] = __byte_perm (w[12], w[11], selector); w[26] = __byte_perm (w[11], w[10], selector); w[25] = __byte_perm (w[10], w[ 9], selector); w[24] = __byte_perm (w[ 9], w[ 8], selector); w[23] = __byte_perm (w[ 8], w[ 7], selector); w[22] = __byte_perm (w[ 7], w[ 6], selector); w[21] = __byte_perm (w[ 6], w[ 5], selector); w[20] = __byte_perm (w[ 5], w[ 4], selector); w[19] = __byte_perm (w[ 4], w[ 3], selector); w[18] = __byte_perm (w[ 3], w[ 2], selector); w[17] = __byte_perm (w[ 2], w[ 1], selector); w[16] = __byte_perm (w[ 1], w[ 0], selector); w[15] = __byte_perm (w[ 0], 0, selector); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = __byte_perm (w[47], w[46], selector); w[62] = __byte_perm (w[46], w[45], selector); w[61] = __byte_perm (w[45], w[44], selector); w[60] = __byte_perm (w[44], w[43], selector); w[59] = __byte_perm (w[43], w[42], selector); w[58] = __byte_perm (w[42], w[41], selector); w[57] = __byte_perm (w[41], w[40], selector); w[56] = __byte_perm (w[40], w[39], selector); w[55] = __byte_perm (w[39], w[38], selector); w[54] = __byte_perm (w[38], w[37], selector); w[53] = __byte_perm (w[37], w[36], selector); w[52] = __byte_perm (w[36], w[35], selector); w[51] = __byte_perm (w[35], w[34], selector); w[50] = __byte_perm (w[34], w[33], selector); 
w[49] = __byte_perm (w[33], w[32], selector); w[48] = __byte_perm (w[32], w[31], selector); w[47] = __byte_perm (w[31], w[30], selector); w[46] = __byte_perm (w[30], w[29], selector); w[45] = __byte_perm (w[29], w[28], selector); w[44] = __byte_perm (w[28], w[27], selector); w[43] = __byte_perm (w[27], w[26], selector); w[42] = __byte_perm (w[26], w[25], selector); w[41] = __byte_perm (w[25], w[24], selector); w[40] = __byte_perm (w[24], w[23], selector); w[39] = __byte_perm (w[23], w[22], selector); w[38] = __byte_perm (w[22], w[21], selector); w[37] = __byte_perm (w[21], w[20], selector); w[36] = __byte_perm (w[20], w[19], selector); w[35] = __byte_perm (w[19], w[18], selector); w[34] = __byte_perm (w[18], w[17], selector); w[33] = __byte_perm (w[17], w[16], selector); w[32] = __byte_perm (w[16], w[15], selector); w[31] = __byte_perm (w[15], w[14], selector); w[30] = __byte_perm (w[14], w[13], selector); w[29] = __byte_perm (w[13], w[12], selector); w[28] = __byte_perm (w[12], w[11], selector); w[27] = __byte_perm (w[11], w[10], selector); w[26] = __byte_perm (w[10], w[ 9], selector); w[25] = __byte_perm (w[ 9], w[ 8], selector); w[24] = __byte_perm (w[ 8], w[ 7], selector); w[23] = __byte_perm (w[ 7], w[ 6], selector); w[22] = __byte_perm (w[ 6], w[ 5], selector); w[21] = __byte_perm (w[ 5], w[ 4], selector); w[20] = __byte_perm (w[ 4], w[ 3], selector); w[19] = __byte_perm (w[ 3], w[ 2], selector); w[18] = __byte_perm (w[ 2], w[ 1], selector); w[17] = __byte_perm (w[ 1], w[ 0], selector); w[16] = __byte_perm (w[ 0], 0, selector); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = __byte_perm (w[46], w[45], selector); w[62] = __byte_perm (w[45], w[44], selector); w[61] = __byte_perm (w[44], w[43], selector); w[60] = __byte_perm (w[43], w[42], selector); w[59] = __byte_perm (w[42], w[41], selector); w[58] = 
__byte_perm (w[41], w[40], selector); w[57] = __byte_perm (w[40], w[39], selector); w[56] = __byte_perm (w[39], w[38], selector); w[55] = __byte_perm (w[38], w[37], selector); w[54] = __byte_perm (w[37], w[36], selector); w[53] = __byte_perm (w[36], w[35], selector); w[52] = __byte_perm (w[35], w[34], selector); w[51] = __byte_perm (w[34], w[33], selector); w[50] = __byte_perm (w[33], w[32], selector); w[49] = __byte_perm (w[32], w[31], selector); w[48] = __byte_perm (w[31], w[30], selector); w[47] = __byte_perm (w[30], w[29], selector); w[46] = __byte_perm (w[29], w[28], selector); w[45] = __byte_perm (w[28], w[27], selector); w[44] = __byte_perm (w[27], w[26], selector); w[43] = __byte_perm (w[26], w[25], selector); w[42] = __byte_perm (w[25], w[24], selector); w[41] = __byte_perm (w[24], w[23], selector); w[40] = __byte_perm (w[23], w[22], selector); w[39] = __byte_perm (w[22], w[21], selector); w[38] = __byte_perm (w[21], w[20], selector); w[37] = __byte_perm (w[20], w[19], selector); w[36] = __byte_perm (w[19], w[18], selector); w[35] = __byte_perm (w[18], w[17], selector); w[34] = __byte_perm (w[17], w[16], selector); w[33] = __byte_perm (w[16], w[15], selector); w[32] = __byte_perm (w[15], w[14], selector); w[31] = __byte_perm (w[14], w[13], selector); w[30] = __byte_perm (w[13], w[12], selector); w[29] = __byte_perm (w[12], w[11], selector); w[28] = __byte_perm (w[11], w[10], selector); w[27] = __byte_perm (w[10], w[ 9], selector); w[26] = __byte_perm (w[ 9], w[ 8], selector); w[25] = __byte_perm (w[ 8], w[ 7], selector); w[24] = __byte_perm (w[ 7], w[ 6], selector); w[23] = __byte_perm (w[ 6], w[ 5], selector); w[22] = __byte_perm (w[ 5], w[ 4], selector); w[21] = __byte_perm (w[ 4], w[ 3], selector); w[20] = __byte_perm (w[ 3], w[ 2], selector); w[19] = __byte_perm (w[ 2], w[ 1], selector); w[18] = __byte_perm (w[ 1], w[ 0], selector); w[17] = __byte_perm (w[ 0], 0, selector); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 
9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = __byte_perm (w[45], w[44], selector); w[62] = __byte_perm (w[44], w[43], selector); w[61] = __byte_perm (w[43], w[42], selector); w[60] = __byte_perm (w[42], w[41], selector); w[59] = __byte_perm (w[41], w[40], selector); w[58] = __byte_perm (w[40], w[39], selector); w[57] = __byte_perm (w[39], w[38], selector); w[56] = __byte_perm (w[38], w[37], selector); w[55] = __byte_perm (w[37], w[36], selector); w[54] = __byte_perm (w[36], w[35], selector); w[53] = __byte_perm (w[35], w[34], selector); w[52] = __byte_perm (w[34], w[33], selector); w[51] = __byte_perm (w[33], w[32], selector); w[50] = __byte_perm (w[32], w[31], selector); w[49] = __byte_perm (w[31], w[30], selector); w[48] = __byte_perm (w[30], w[29], selector); w[47] = __byte_perm (w[29], w[28], selector); w[46] = __byte_perm (w[28], w[27], selector); w[45] = __byte_perm (w[27], w[26], selector); w[44] = __byte_perm (w[26], w[25], selector); w[43] = __byte_perm (w[25], w[24], selector); w[42] = __byte_perm (w[24], w[23], selector); w[41] = __byte_perm (w[23], w[22], selector); w[40] = __byte_perm (w[22], w[21], selector); w[39] = __byte_perm (w[21], w[20], selector); w[38] = __byte_perm (w[20], w[19], selector); w[37] = __byte_perm (w[19], w[18], selector); w[36] = __byte_perm (w[18], w[17], selector); w[35] = __byte_perm (w[17], w[16], selector); w[34] = __byte_perm (w[16], w[15], selector); w[33] = __byte_perm (w[15], w[14], selector); w[32] = __byte_perm (w[14], w[13], selector); w[31] = __byte_perm (w[13], w[12], selector); w[30] = __byte_perm (w[12], w[11], selector); w[29] = __byte_perm (w[11], w[10], selector); w[28] = __byte_perm (w[10], w[ 9], selector); w[27] = __byte_perm (w[ 9], w[ 8], selector); w[26] = __byte_perm (w[ 8], w[ 7], selector); w[25] = __byte_perm (w[ 7], w[ 6], selector); w[24] = __byte_perm (w[ 6], w[ 5], selector); w[23] = __byte_perm (w[ 5], w[ 4], 
selector); w[22] = __byte_perm (w[ 4], w[ 3], selector); w[21] = __byte_perm (w[ 3], w[ 2], selector); w[20] = __byte_perm (w[ 2], w[ 1], selector); w[19] = __byte_perm (w[ 1], w[ 0], selector); w[18] = __byte_perm (w[ 0], 0, selector); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = __byte_perm (w[44], w[43], selector); w[62] = __byte_perm (w[43], w[42], selector); w[61] = __byte_perm (w[42], w[41], selector); w[60] = __byte_perm (w[41], w[40], selector); w[59] = __byte_perm (w[40], w[39], selector); w[58] = __byte_perm (w[39], w[38], selector); w[57] = __byte_perm (w[38], w[37], selector); w[56] = __byte_perm (w[37], w[36], selector); w[55] = __byte_perm (w[36], w[35], selector); w[54] = __byte_perm (w[35], w[34], selector); w[53] = __byte_perm (w[34], w[33], selector); w[52] = __byte_perm (w[33], w[32], selector); w[51] = __byte_perm (w[32], w[31], selector); w[50] = __byte_perm (w[31], w[30], selector); w[49] = __byte_perm (w[30], w[29], selector); w[48] = __byte_perm (w[29], w[28], selector); w[47] = __byte_perm (w[28], w[27], selector); w[46] = __byte_perm (w[27], w[26], selector); w[45] = __byte_perm (w[26], w[25], selector); w[44] = __byte_perm (w[25], w[24], selector); w[43] = __byte_perm (w[24], w[23], selector); w[42] = __byte_perm (w[23], w[22], selector); w[41] = __byte_perm (w[22], w[21], selector); w[40] = __byte_perm (w[21], w[20], selector); w[39] = __byte_perm (w[20], w[19], selector); w[38] = __byte_perm (w[19], w[18], selector); w[37] = __byte_perm (w[18], w[17], selector); w[36] = __byte_perm (w[17], w[16], selector); w[35] = __byte_perm (w[16], w[15], selector); w[34] = __byte_perm (w[15], w[14], selector); w[33] = __byte_perm (w[14], w[13], selector); w[32] = __byte_perm (w[13], w[12], selector); w[31] = __byte_perm (w[12], w[11], selector); w[30] = __byte_perm (w[11], 
w[10], selector); w[29] = __byte_perm (w[10], w[ 9], selector); w[28] = __byte_perm (w[ 9], w[ 8], selector); w[27] = __byte_perm (w[ 8], w[ 7], selector); w[26] = __byte_perm (w[ 7], w[ 6], selector); w[25] = __byte_perm (w[ 6], w[ 5], selector); w[24] = __byte_perm (w[ 5], w[ 4], selector); w[23] = __byte_perm (w[ 4], w[ 3], selector); w[22] = __byte_perm (w[ 3], w[ 2], selector); w[21] = __byte_perm (w[ 2], w[ 1], selector); w[20] = __byte_perm (w[ 1], w[ 0], selector); w[19] = __byte_perm (w[ 0], 0, selector); w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = __byte_perm (w[43], w[42], selector); w[62] = __byte_perm (w[42], w[41], selector); w[61] = __byte_perm (w[41], w[40], selector); w[60] = __byte_perm (w[40], w[39], selector); w[59] = __byte_perm (w[39], w[38], selector); w[58] = __byte_perm (w[38], w[37], selector); w[57] = __byte_perm (w[37], w[36], selector); w[56] = __byte_perm (w[36], w[35], selector); w[55] = __byte_perm (w[35], w[34], selector); w[54] = __byte_perm (w[34], w[33], selector); w[53] = __byte_perm (w[33], w[32], selector); w[52] = __byte_perm (w[32], w[31], selector); w[51] = __byte_perm (w[31], w[30], selector); w[50] = __byte_perm (w[30], w[29], selector); w[49] = __byte_perm (w[29], w[28], selector); w[48] = __byte_perm (w[28], w[27], selector); w[47] = __byte_perm (w[27], w[26], selector); w[46] = __byte_perm (w[26], w[25], selector); w[45] = __byte_perm (w[25], w[24], selector); w[44] = __byte_perm (w[24], w[23], selector); w[43] = __byte_perm (w[23], w[22], selector); w[42] = __byte_perm (w[22], w[21], selector); w[41] = __byte_perm (w[21], w[20], selector); w[40] = __byte_perm (w[20], w[19], selector); w[39] = __byte_perm (w[19], w[18], selector); w[38] = __byte_perm (w[18], w[17], selector); w[37] = __byte_perm (w[17], w[16], selector); w[36] = 
__byte_perm (w[16], w[15], selector); w[35] = __byte_perm (w[15], w[14], selector); w[34] = __byte_perm (w[14], w[13], selector); w[33] = __byte_perm (w[13], w[12], selector); w[32] = __byte_perm (w[12], w[11], selector); w[31] = __byte_perm (w[11], w[10], selector); w[30] = __byte_perm (w[10], w[ 9], selector); w[29] = __byte_perm (w[ 9], w[ 8], selector); w[28] = __byte_perm (w[ 8], w[ 7], selector); w[27] = __byte_perm (w[ 7], w[ 6], selector); w[26] = __byte_perm (w[ 6], w[ 5], selector); w[25] = __byte_perm (w[ 5], w[ 4], selector); w[24] = __byte_perm (w[ 4], w[ 3], selector); w[23] = __byte_perm (w[ 3], w[ 2], selector); w[22] = __byte_perm (w[ 2], w[ 1], selector); w[21] = __byte_perm (w[ 1], w[ 0], selector); w[20] = __byte_perm (w[ 0], 0, selector); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = __byte_perm (w[42], w[41], selector); w[62] = __byte_perm (w[41], w[40], selector); w[61] = __byte_perm (w[40], w[39], selector); w[60] = __byte_perm (w[39], w[38], selector); w[59] = __byte_perm (w[38], w[37], selector); w[58] = __byte_perm (w[37], w[36], selector); w[57] = __byte_perm (w[36], w[35], selector); w[56] = __byte_perm (w[35], w[34], selector); w[55] = __byte_perm (w[34], w[33], selector); w[54] = __byte_perm (w[33], w[32], selector); w[53] = __byte_perm (w[32], w[31], selector); w[52] = __byte_perm (w[31], w[30], selector); w[51] = __byte_perm (w[30], w[29], selector); w[50] = __byte_perm (w[29], w[28], selector); w[49] = __byte_perm (w[28], w[27], selector); w[48] = __byte_perm (w[27], w[26], selector); w[47] = __byte_perm (w[26], w[25], selector); w[46] = __byte_perm (w[25], w[24], selector); w[45] = __byte_perm (w[24], w[23], selector); w[44] = __byte_perm (w[23], w[22], selector); w[43] = __byte_perm (w[22], w[21], selector); w[42] = __byte_perm (w[21], 
w[20], selector); w[41] = __byte_perm (w[20], w[19], selector); w[40] = __byte_perm (w[19], w[18], selector); w[39] = __byte_perm (w[18], w[17], selector); w[38] = __byte_perm (w[17], w[16], selector); w[37] = __byte_perm (w[16], w[15], selector); w[36] = __byte_perm (w[15], w[14], selector); w[35] = __byte_perm (w[14], w[13], selector); w[34] = __byte_perm (w[13], w[12], selector); w[33] = __byte_perm (w[12], w[11], selector); w[32] = __byte_perm (w[11], w[10], selector); w[31] = __byte_perm (w[10], w[ 9], selector); w[30] = __byte_perm (w[ 9], w[ 8], selector); w[29] = __byte_perm (w[ 8], w[ 7], selector); w[28] = __byte_perm (w[ 7], w[ 6], selector); w[27] = __byte_perm (w[ 6], w[ 5], selector); w[26] = __byte_perm (w[ 5], w[ 4], selector); w[25] = __byte_perm (w[ 4], w[ 3], selector); w[24] = __byte_perm (w[ 3], w[ 2], selector); w[23] = __byte_perm (w[ 2], w[ 1], selector); w[22] = __byte_perm (w[ 1], w[ 0], selector); w[21] = __byte_perm (w[ 0], 0, selector); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = __byte_perm (w[41], w[40], selector); w[62] = __byte_perm (w[40], w[39], selector); w[61] = __byte_perm (w[39], w[38], selector); w[60] = __byte_perm (w[38], w[37], selector); w[59] = __byte_perm (w[37], w[36], selector); w[58] = __byte_perm (w[36], w[35], selector); w[57] = __byte_perm (w[35], w[34], selector); w[56] = __byte_perm (w[34], w[33], selector); w[55] = __byte_perm (w[33], w[32], selector); w[54] = __byte_perm (w[32], w[31], selector); w[53] = __byte_perm (w[31], w[30], selector); w[52] = __byte_perm (w[30], w[29], selector); w[51] = __byte_perm (w[29], w[28], selector); w[50] = __byte_perm (w[28], w[27], selector); w[49] = __byte_perm (w[27], w[26], selector); w[48] = __byte_perm (w[26], w[25], selector); w[47] = __byte_perm (w[25], w[24], 
selector); w[46] = __byte_perm (w[24], w[23], selector); w[45] = __byte_perm (w[23], w[22], selector); w[44] = __byte_perm (w[22], w[21], selector); w[43] = __byte_perm (w[21], w[20], selector); w[42] = __byte_perm (w[20], w[19], selector); w[41] = __byte_perm (w[19], w[18], selector); w[40] = __byte_perm (w[18], w[17], selector); w[39] = __byte_perm (w[17], w[16], selector); w[38] = __byte_perm (w[16], w[15], selector); w[37] = __byte_perm (w[15], w[14], selector); w[36] = __byte_perm (w[14], w[13], selector); w[35] = __byte_perm (w[13], w[12], selector); w[34] = __byte_perm (w[12], w[11], selector); w[33] = __byte_perm (w[11], w[10], selector); w[32] = __byte_perm (w[10], w[ 9], selector); w[31] = __byte_perm (w[ 9], w[ 8], selector); w[30] = __byte_perm (w[ 8], w[ 7], selector); w[29] = __byte_perm (w[ 7], w[ 6], selector); w[28] = __byte_perm (w[ 6], w[ 5], selector); w[27] = __byte_perm (w[ 5], w[ 4], selector); w[26] = __byte_perm (w[ 4], w[ 3], selector); w[25] = __byte_perm (w[ 3], w[ 2], selector); w[24] = __byte_perm (w[ 2], w[ 1], selector); w[23] = __byte_perm (w[ 1], w[ 0], selector); w[22] = __byte_perm (w[ 0], 0, selector); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = __byte_perm (w[40], w[39], selector); w[62] = __byte_perm (w[39], w[38], selector); w[61] = __byte_perm (w[38], w[37], selector); w[60] = __byte_perm (w[37], w[36], selector); w[59] = __byte_perm (w[36], w[35], selector); w[58] = __byte_perm (w[35], w[34], selector); w[57] = __byte_perm (w[34], w[33], selector); w[56] = __byte_perm (w[33], w[32], selector); w[55] = __byte_perm (w[32], w[31], selector); w[54] = __byte_perm (w[31], w[30], selector); w[53] = __byte_perm (w[30], w[29], selector); w[52] = __byte_perm (w[29], w[28], selector); w[51] = __byte_perm (w[28], 
w[27], selector); w[50] = __byte_perm (w[27], w[26], selector); w[49] = __byte_perm (w[26], w[25], selector); w[48] = __byte_perm (w[25], w[24], selector); w[47] = __byte_perm (w[24], w[23], selector); w[46] = __byte_perm (w[23], w[22], selector); w[45] = __byte_perm (w[22], w[21], selector); w[44] = __byte_perm (w[21], w[20], selector); w[43] = __byte_perm (w[20], w[19], selector); w[42] = __byte_perm (w[19], w[18], selector); w[41] = __byte_perm (w[18], w[17], selector); w[40] = __byte_perm (w[17], w[16], selector); w[39] = __byte_perm (w[16], w[15], selector); w[38] = __byte_perm (w[15], w[14], selector); w[37] = __byte_perm (w[14], w[13], selector); w[36] = __byte_perm (w[13], w[12], selector); w[35] = __byte_perm (w[12], w[11], selector); w[34] = __byte_perm (w[11], w[10], selector); w[33] = __byte_perm (w[10], w[ 9], selector); w[32] = __byte_perm (w[ 9], w[ 8], selector); w[31] = __byte_perm (w[ 8], w[ 7], selector); w[30] = __byte_perm (w[ 7], w[ 6], selector); w[29] = __byte_perm (w[ 6], w[ 5], selector); w[28] = __byte_perm (w[ 5], w[ 4], selector); w[27] = __byte_perm (w[ 4], w[ 3], selector); w[26] = __byte_perm (w[ 3], w[ 2], selector); w[25] = __byte_perm (w[ 2], w[ 1], selector); w[24] = __byte_perm (w[ 1], w[ 0], selector); w[23] = __byte_perm (w[ 0], 0, selector); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = __byte_perm (w[39], w[38], selector); w[62] = __byte_perm (w[38], w[37], selector); w[61] = __byte_perm (w[37], w[36], selector); w[60] = __byte_perm (w[36], w[35], selector); w[59] = __byte_perm (w[35], w[34], selector); w[58] = __byte_perm (w[34], w[33], selector); w[57] = __byte_perm (w[33], w[32], selector); w[56] = __byte_perm (w[32], w[31], selector); w[55] = __byte_perm (w[31], w[30], selector); w[54] = 
__byte_perm (w[30], w[29], selector); w[53] = __byte_perm (w[29], w[28], selector); w[52] = __byte_perm (w[28], w[27], selector); w[51] = __byte_perm (w[27], w[26], selector); w[50] = __byte_perm (w[26], w[25], selector); w[49] = __byte_perm (w[25], w[24], selector); w[48] = __byte_perm (w[24], w[23], selector); w[47] = __byte_perm (w[23], w[22], selector); w[46] = __byte_perm (w[22], w[21], selector); w[45] = __byte_perm (w[21], w[20], selector); w[44] = __byte_perm (w[20], w[19], selector); w[43] = __byte_perm (w[19], w[18], selector); w[42] = __byte_perm (w[18], w[17], selector); w[41] = __byte_perm (w[17], w[16], selector); w[40] = __byte_perm (w[16], w[15], selector); w[39] = __byte_perm (w[15], w[14], selector); w[38] = __byte_perm (w[14], w[13], selector); w[37] = __byte_perm (w[13], w[12], selector); w[36] = __byte_perm (w[12], w[11], selector); w[35] = __byte_perm (w[11], w[10], selector); w[34] = __byte_perm (w[10], w[ 9], selector); w[33] = __byte_perm (w[ 9], w[ 8], selector); w[32] = __byte_perm (w[ 8], w[ 7], selector); w[31] = __byte_perm (w[ 7], w[ 6], selector); w[30] = __byte_perm (w[ 6], w[ 5], selector); w[29] = __byte_perm (w[ 5], w[ 4], selector); w[28] = __byte_perm (w[ 4], w[ 3], selector); w[27] = __byte_perm (w[ 3], w[ 2], selector); w[26] = __byte_perm (w[ 2], w[ 1], selector); w[25] = __byte_perm (w[ 1], w[ 0], selector); w[24] = __byte_perm (w[ 0], 0, selector); w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = __byte_perm (w[38], w[37], selector); w[62] = __byte_perm (w[37], w[36], selector); w[61] = __byte_perm (w[36], w[35], selector); w[60] = __byte_perm (w[35], w[34], selector); w[59] = __byte_perm (w[34], w[33], selector); w[58] = __byte_perm (w[33], w[32], selector); w[57] = __byte_perm (w[32], 
w[31], selector); w[56] = __byte_perm (w[31], w[30], selector); w[55] = __byte_perm (w[30], w[29], selector); w[54] = __byte_perm (w[29], w[28], selector); w[53] = __byte_perm (w[28], w[27], selector); w[52] = __byte_perm (w[27], w[26], selector); w[51] = __byte_perm (w[26], w[25], selector); w[50] = __byte_perm (w[25], w[24], selector); w[49] = __byte_perm (w[24], w[23], selector); w[48] = __byte_perm (w[23], w[22], selector); w[47] = __byte_perm (w[22], w[21], selector); w[46] = __byte_perm (w[21], w[20], selector); w[45] = __byte_perm (w[20], w[19], selector); w[44] = __byte_perm (w[19], w[18], selector); w[43] = __byte_perm (w[18], w[17], selector); w[42] = __byte_perm (w[17], w[16], selector); w[41] = __byte_perm (w[16], w[15], selector); w[40] = __byte_perm (w[15], w[14], selector); w[39] = __byte_perm (w[14], w[13], selector); w[38] = __byte_perm (w[13], w[12], selector); w[37] = __byte_perm (w[12], w[11], selector); w[36] = __byte_perm (w[11], w[10], selector); w[35] = __byte_perm (w[10], w[ 9], selector); w[34] = __byte_perm (w[ 9], w[ 8], selector); w[33] = __byte_perm (w[ 8], w[ 7], selector); w[32] = __byte_perm (w[ 7], w[ 6], selector); w[31] = __byte_perm (w[ 6], w[ 5], selector); w[30] = __byte_perm (w[ 5], w[ 4], selector); w[29] = __byte_perm (w[ 4], w[ 3], selector); w[28] = __byte_perm (w[ 3], w[ 2], selector); w[27] = __byte_perm (w[ 2], w[ 1], selector); w[26] = __byte_perm (w[ 1], w[ 0], selector); w[25] = __byte_perm (w[ 0], 0, selector); w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = __byte_perm (w[37], w[36], selector); w[62] = __byte_perm (w[36], w[35], selector); w[61] = __byte_perm (w[35], w[34], selector); w[60] = __byte_perm (w[34], w[33], selector); w[59] = __byte_perm (w[33], w[32], 
selector); w[58] = __byte_perm (w[32], w[31], selector); w[57] = __byte_perm (w[31], w[30], selector); w[56] = __byte_perm (w[30], w[29], selector); w[55] = __byte_perm (w[29], w[28], selector); w[54] = __byte_perm (w[28], w[27], selector); w[53] = __byte_perm (w[27], w[26], selector); w[52] = __byte_perm (w[26], w[25], selector); w[51] = __byte_perm (w[25], w[24], selector); w[50] = __byte_perm (w[24], w[23], selector); w[49] = __byte_perm (w[23], w[22], selector); w[48] = __byte_perm (w[22], w[21], selector); w[47] = __byte_perm (w[21], w[20], selector); w[46] = __byte_perm (w[20], w[19], selector); w[45] = __byte_perm (w[19], w[18], selector); w[44] = __byte_perm (w[18], w[17], selector); w[43] = __byte_perm (w[17], w[16], selector); w[42] = __byte_perm (w[16], w[15], selector); w[41] = __byte_perm (w[15], w[14], selector); w[40] = __byte_perm (w[14], w[13], selector); w[39] = __byte_perm (w[13], w[12], selector); w[38] = __byte_perm (w[12], w[11], selector); w[37] = __byte_perm (w[11], w[10], selector); w[36] = __byte_perm (w[10], w[ 9], selector); w[35] = __byte_perm (w[ 9], w[ 8], selector); w[34] = __byte_perm (w[ 8], w[ 7], selector); w[33] = __byte_perm (w[ 7], w[ 6], selector); w[32] = __byte_perm (w[ 6], w[ 5], selector); w[31] = __byte_perm (w[ 5], w[ 4], selector); w[30] = __byte_perm (w[ 4], w[ 3], selector); w[29] = __byte_perm (w[ 3], w[ 2], selector); w[28] = __byte_perm (w[ 2], w[ 1], selector); w[27] = __byte_perm (w[ 1], w[ 0], selector); w[26] = __byte_perm (w[ 0], 0, selector); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = __byte_perm (w[36], w[35], selector); w[62] = __byte_perm (w[35], w[34], selector); w[61] = __byte_perm (w[34], w[33], selector); w[60] = __byte_perm (w[33], w[32], 
selector); w[59] = __byte_perm (w[32], w[31], selector); w[58] = __byte_perm (w[31], w[30], selector); w[57] = __byte_perm (w[30], w[29], selector); w[56] = __byte_perm (w[29], w[28], selector); w[55] = __byte_perm (w[28], w[27], selector); w[54] = __byte_perm (w[27], w[26], selector); w[53] = __byte_perm (w[26], w[25], selector); w[52] = __byte_perm (w[25], w[24], selector); w[51] = __byte_perm (w[24], w[23], selector); w[50] = __byte_perm (w[23], w[22], selector); w[49] = __byte_perm (w[22], w[21], selector); w[48] = __byte_perm (w[21], w[20], selector); w[47] = __byte_perm (w[20], w[19], selector); w[46] = __byte_perm (w[19], w[18], selector); w[45] = __byte_perm (w[18], w[17], selector); w[44] = __byte_perm (w[17], w[16], selector); w[43] = __byte_perm (w[16], w[15], selector); w[42] = __byte_perm (w[15], w[14], selector); w[41] = __byte_perm (w[14], w[13], selector); w[40] = __byte_perm (w[13], w[12], selector); w[39] = __byte_perm (w[12], w[11], selector); w[38] = __byte_perm (w[11], w[10], selector); w[37] = __byte_perm (w[10], w[ 9], selector); w[36] = __byte_perm (w[ 9], w[ 8], selector); w[35] = __byte_perm (w[ 8], w[ 7], selector); w[34] = __byte_perm (w[ 7], w[ 6], selector); w[33] = __byte_perm (w[ 6], w[ 5], selector); w[32] = __byte_perm (w[ 5], w[ 4], selector); w[31] = __byte_perm (w[ 4], w[ 3], selector); w[30] = __byte_perm (w[ 3], w[ 2], selector); w[29] = __byte_perm (w[ 2], w[ 1], selector); w[28] = __byte_perm (w[ 1], w[ 0], selector); w[27] = __byte_perm (w[ 0], 0, selector); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = __byte_perm (w[35], w[34], selector); w[62] = __byte_perm (w[34], w[33], selector); w[61] = __byte_perm (w[33], w[32], selector); w[60] = __byte_perm 
(w[32], w[31], selector); w[59] = __byte_perm (w[31], w[30], selector); w[58] = __byte_perm (w[30], w[29], selector); w[57] = __byte_perm (w[29], w[28], selector); w[56] = __byte_perm (w[28], w[27], selector); w[55] = __byte_perm (w[27], w[26], selector); w[54] = __byte_perm (w[26], w[25], selector); w[53] = __byte_perm (w[25], w[24], selector); w[52] = __byte_perm (w[24], w[23], selector); w[51] = __byte_perm (w[23], w[22], selector); w[50] = __byte_perm (w[22], w[21], selector); w[49] = __byte_perm (w[21], w[20], selector); w[48] = __byte_perm (w[20], w[19], selector); w[47] = __byte_perm (w[19], w[18], selector); w[46] = __byte_perm (w[18], w[17], selector); w[45] = __byte_perm (w[17], w[16], selector); w[44] = __byte_perm (w[16], w[15], selector); w[43] = __byte_perm (w[15], w[14], selector); w[42] = __byte_perm (w[14], w[13], selector); w[41] = __byte_perm (w[13], w[12], selector); w[40] = __byte_perm (w[12], w[11], selector); w[39] = __byte_perm (w[11], w[10], selector); w[38] = __byte_perm (w[10], w[ 9], selector); w[37] = __byte_perm (w[ 9], w[ 8], selector); w[36] = __byte_perm (w[ 8], w[ 7], selector); w[35] = __byte_perm (w[ 7], w[ 6], selector); w[34] = __byte_perm (w[ 6], w[ 5], selector); w[33] = __byte_perm (w[ 5], w[ 4], selector); w[32] = __byte_perm (w[ 4], w[ 3], selector); w[31] = __byte_perm (w[ 3], w[ 2], selector); w[30] = __byte_perm (w[ 2], w[ 1], selector); w[29] = __byte_perm (w[ 1], w[ 0], selector); w[28] = __byte_perm (w[ 0], 0, selector); w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 29: w[63] = __byte_perm (w[34], w[33], selector); w[62] = __byte_perm (w[33], w[32], selector); w[61] = __byte_perm (w[32], w[31], selector); w[60] = __byte_perm (w[31], w[30], 
selector); w[59] = __byte_perm (w[30], w[29], selector); w[58] = __byte_perm (w[29], w[28], selector); w[57] = __byte_perm (w[28], w[27], selector); w[56] = __byte_perm (w[27], w[26], selector); w[55] = __byte_perm (w[26], w[25], selector); w[54] = __byte_perm (w[25], w[24], selector); w[53] = __byte_perm (w[24], w[23], selector); w[52] = __byte_perm (w[23], w[22], selector); w[51] = __byte_perm (w[22], w[21], selector); w[50] = __byte_perm (w[21], w[20], selector); w[49] = __byte_perm (w[20], w[19], selector); w[48] = __byte_perm (w[19], w[18], selector); w[47] = __byte_perm (w[18], w[17], selector); w[46] = __byte_perm (w[17], w[16], selector); w[45] = __byte_perm (w[16], w[15], selector); w[44] = __byte_perm (w[15], w[14], selector); w[43] = __byte_perm (w[14], w[13], selector); w[42] = __byte_perm (w[13], w[12], selector); w[41] = __byte_perm (w[12], w[11], selector); w[40] = __byte_perm (w[11], w[10], selector); w[39] = __byte_perm (w[10], w[ 9], selector); w[38] = __byte_perm (w[ 9], w[ 8], selector); w[37] = __byte_perm (w[ 8], w[ 7], selector); w[36] = __byte_perm (w[ 7], w[ 6], selector); w[35] = __byte_perm (w[ 6], w[ 5], selector); w[34] = __byte_perm (w[ 5], w[ 4], selector); w[33] = __byte_perm (w[ 4], w[ 3], selector); w[32] = __byte_perm (w[ 3], w[ 2], selector); w[31] = __byte_perm (w[ 2], w[ 1], selector); w[30] = __byte_perm (w[ 1], w[ 0], selector); w[29] = __byte_perm (w[ 0], 0, selector); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 30: w[63] = __byte_perm (w[33], w[32], selector); w[62] = __byte_perm (w[32], w[31], selector); w[61] = __byte_perm (w[31], w[30], selector); w[60] = __byte_perm (w[30], w[29], selector); w[59] = __byte_perm (w[29], w[28], selector); 
w[58] = __byte_perm (w[28], w[27], selector); w[57] = __byte_perm (w[27], w[26], selector); w[56] = __byte_perm (w[26], w[25], selector); w[55] = __byte_perm (w[25], w[24], selector); w[54] = __byte_perm (w[24], w[23], selector); w[53] = __byte_perm (w[23], w[22], selector); w[52] = __byte_perm (w[22], w[21], selector); w[51] = __byte_perm (w[21], w[20], selector); w[50] = __byte_perm (w[20], w[19], selector); w[49] = __byte_perm (w[19], w[18], selector); w[48] = __byte_perm (w[18], w[17], selector); w[47] = __byte_perm (w[17], w[16], selector); w[46] = __byte_perm (w[16], w[15], selector); w[45] = __byte_perm (w[15], w[14], selector); w[44] = __byte_perm (w[14], w[13], selector); w[43] = __byte_perm (w[13], w[12], selector); w[42] = __byte_perm (w[12], w[11], selector); w[41] = __byte_perm (w[11], w[10], selector); w[40] = __byte_perm (w[10], w[ 9], selector); w[39] = __byte_perm (w[ 9], w[ 8], selector); w[38] = __byte_perm (w[ 8], w[ 7], selector); w[37] = __byte_perm (w[ 7], w[ 6], selector); w[36] = __byte_perm (w[ 6], w[ 5], selector); w[35] = __byte_perm (w[ 5], w[ 4], selector); w[34] = __byte_perm (w[ 4], w[ 3], selector); w[33] = __byte_perm (w[ 3], w[ 2], selector); w[32] = __byte_perm (w[ 2], w[ 1], selector); w[31] = __byte_perm (w[ 1], w[ 0], selector); w[30] = __byte_perm (w[ 0], 0, selector); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 31: w[63] = __byte_perm (w[32], w[31], selector); w[62] = __byte_perm (w[31], w[30], selector); w[61] = __byte_perm (w[30], w[29], selector); w[60] = __byte_perm (w[29], w[28], selector); w[59] = __byte_perm (w[28], w[27], selector); w[58] = __byte_perm (w[27], w[26], selector); w[57] = __byte_perm (w[26], w[25], selector); 
w[56] = __byte_perm (w[25], w[24], selector); w[55] = __byte_perm (w[24], w[23], selector); w[54] = __byte_perm (w[23], w[22], selector); w[53] = __byte_perm (w[22], w[21], selector); w[52] = __byte_perm (w[21], w[20], selector); w[51] = __byte_perm (w[20], w[19], selector); w[50] = __byte_perm (w[19], w[18], selector); w[49] = __byte_perm (w[18], w[17], selector); w[48] = __byte_perm (w[17], w[16], selector); w[47] = __byte_perm (w[16], w[15], selector); w[46] = __byte_perm (w[15], w[14], selector); w[45] = __byte_perm (w[14], w[13], selector); w[44] = __byte_perm (w[13], w[12], selector); w[43] = __byte_perm (w[12], w[11], selector); w[42] = __byte_perm (w[11], w[10], selector); w[41] = __byte_perm (w[10], w[ 9], selector); w[40] = __byte_perm (w[ 9], w[ 8], selector); w[39] = __byte_perm (w[ 8], w[ 7], selector); w[38] = __byte_perm (w[ 7], w[ 6], selector); w[37] = __byte_perm (w[ 6], w[ 5], selector); w[36] = __byte_perm (w[ 5], w[ 4], selector); w[35] = __byte_perm (w[ 4], w[ 3], selector); w[34] = __byte_perm (w[ 3], w[ 2], selector); w[33] = __byte_perm (w[ 2], w[ 1], selector); w[32] = __byte_perm (w[ 1], w[ 0], selector); w[31] = __byte_perm (w[ 0], 0, selector); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 32: w[63] = __byte_perm (w[31], w[30], selector); w[62] = __byte_perm (w[30], w[29], selector); w[61] = __byte_perm (w[29], w[28], selector); w[60] = __byte_perm (w[28], w[27], selector); w[59] = __byte_perm (w[27], w[26], selector); w[58] = __byte_perm (w[26], w[25], selector); w[57] = __byte_perm (w[25], w[24], selector); w[56] = __byte_perm (w[24], w[23], selector); w[55] = __byte_perm (w[23], w[22], selector); w[54] = __byte_perm (w[22], w[21], 
selector); w[53] = __byte_perm (w[21], w[20], selector); w[52] = __byte_perm (w[20], w[19], selector); w[51] = __byte_perm (w[19], w[18], selector); w[50] = __byte_perm (w[18], w[17], selector); w[49] = __byte_perm (w[17], w[16], selector); w[48] = __byte_perm (w[16], w[15], selector); w[47] = __byte_perm (w[15], w[14], selector); w[46] = __byte_perm (w[14], w[13], selector); w[45] = __byte_perm (w[13], w[12], selector); w[44] = __byte_perm (w[12], w[11], selector); w[43] = __byte_perm (w[11], w[10], selector); w[42] = __byte_perm (w[10], w[ 9], selector); w[41] = __byte_perm (w[ 9], w[ 8], selector); w[40] = __byte_perm (w[ 8], w[ 7], selector); w[39] = __byte_perm (w[ 7], w[ 6], selector); w[38] = __byte_perm (w[ 6], w[ 5], selector); w[37] = __byte_perm (w[ 5], w[ 4], selector); w[36] = __byte_perm (w[ 4], w[ 3], selector); w[35] = __byte_perm (w[ 3], w[ 2], selector); w[34] = __byte_perm (w[ 2], w[ 1], selector); w[33] = __byte_perm (w[ 1], w[ 0], selector); w[32] = __byte_perm (w[ 0], 0, selector); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = __byte_perm (w[30], w[29], selector); w[62] = __byte_perm (w[29], w[28], selector); w[61] = __byte_perm (w[28], w[27], selector); w[60] = __byte_perm (w[27], w[26], selector); w[59] = __byte_perm (w[26], w[25], selector); w[58] = __byte_perm (w[25], w[24], selector); w[57] = __byte_perm (w[24], w[23], selector); w[56] = __byte_perm (w[23], w[22], selector); w[55] = __byte_perm (w[22], w[21], selector); w[54] = __byte_perm (w[21], w[20], selector); w[53] = __byte_perm (w[20], w[19], selector); w[52] = __byte_perm (w[19], w[18], selector); w[51] = __byte_perm (w[18], w[17], selector); w[50] = 
__byte_perm (w[17], w[16], selector); w[49] = __byte_perm (w[16], w[15], selector); w[48] = __byte_perm (w[15], w[14], selector); w[47] = __byte_perm (w[14], w[13], selector); w[46] = __byte_perm (w[13], w[12], selector); w[45] = __byte_perm (w[12], w[11], selector); w[44] = __byte_perm (w[11], w[10], selector); w[43] = __byte_perm (w[10], w[ 9], selector); w[42] = __byte_perm (w[ 9], w[ 8], selector); w[41] = __byte_perm (w[ 8], w[ 7], selector); w[40] = __byte_perm (w[ 7], w[ 6], selector); w[39] = __byte_perm (w[ 6], w[ 5], selector); w[38] = __byte_perm (w[ 5], w[ 4], selector); w[37] = __byte_perm (w[ 4], w[ 3], selector); w[36] = __byte_perm (w[ 3], w[ 2], selector); w[35] = __byte_perm (w[ 2], w[ 1], selector); w[34] = __byte_perm (w[ 1], w[ 0], selector); w[33] = __byte_perm (w[ 0], 0, selector); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = __byte_perm (w[29], w[28], selector); w[62] = __byte_perm (w[28], w[27], selector); w[61] = __byte_perm (w[27], w[26], selector); w[60] = __byte_perm (w[26], w[25], selector); w[59] = __byte_perm (w[25], w[24], selector); w[58] = __byte_perm (w[24], w[23], selector); w[57] = __byte_perm (w[23], w[22], selector); w[56] = __byte_perm (w[22], w[21], selector); w[55] = __byte_perm (w[21], w[20], selector); w[54] = __byte_perm (w[20], w[19], selector); w[53] = __byte_perm (w[19], w[18], selector); w[52] = __byte_perm (w[18], w[17], selector); w[51] = __byte_perm (w[17], w[16], selector); w[50] = __byte_perm (w[16], w[15], selector); w[49] = __byte_perm (w[15], w[14], selector); w[48] = __byte_perm (w[14], w[13], selector); w[47] = __byte_perm (w[13], w[12], selector); w[46] = __byte_perm 
(w[12], w[11], selector); w[45] = __byte_perm (w[11], w[10], selector); w[44] = __byte_perm (w[10], w[ 9], selector); w[43] = __byte_perm (w[ 9], w[ 8], selector); w[42] = __byte_perm (w[ 8], w[ 7], selector); w[41] = __byte_perm (w[ 7], w[ 6], selector); w[40] = __byte_perm (w[ 6], w[ 5], selector); w[39] = __byte_perm (w[ 5], w[ 4], selector); w[38] = __byte_perm (w[ 4], w[ 3], selector); w[37] = __byte_perm (w[ 3], w[ 2], selector); w[36] = __byte_perm (w[ 2], w[ 1], selector); w[35] = __byte_perm (w[ 1], w[ 0], selector); w[34] = __byte_perm (w[ 0], 0, selector); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = __byte_perm (w[28], w[27], selector); w[62] = __byte_perm (w[27], w[26], selector); w[61] = __byte_perm (w[26], w[25], selector); w[60] = __byte_perm (w[25], w[24], selector); w[59] = __byte_perm (w[24], w[23], selector); w[58] = __byte_perm (w[23], w[22], selector); w[57] = __byte_perm (w[22], w[21], selector); w[56] = __byte_perm (w[21], w[20], selector); w[55] = __byte_perm (w[20], w[19], selector); w[54] = __byte_perm (w[19], w[18], selector); w[53] = __byte_perm (w[18], w[17], selector); w[52] = __byte_perm (w[17], w[16], selector); w[51] = __byte_perm (w[16], w[15], selector); w[50] = __byte_perm (w[15], w[14], selector); w[49] = __byte_perm (w[14], w[13], selector); w[48] = __byte_perm (w[13], w[12], selector); w[47] = __byte_perm (w[12], w[11], selector); w[46] = __byte_perm (w[11], w[10], selector); w[45] = __byte_perm (w[10], w[ 9], selector); w[44] = __byte_perm (w[ 9], w[ 8], selector); w[43] = __byte_perm (w[ 8], w[ 7], selector); w[42] = __byte_perm (w[ 7], w[ 6], selector); w[41] = __byte_perm (w[ 
6], w[ 5], selector); w[40] = __byte_perm (w[ 5], w[ 4], selector); w[39] = __byte_perm (w[ 4], w[ 3], selector); w[38] = __byte_perm (w[ 3], w[ 2], selector); w[37] = __byte_perm (w[ 2], w[ 1], selector); w[36] = __byte_perm (w[ 1], w[ 0], selector); w[35] = __byte_perm (w[ 0], 0, selector); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = __byte_perm (w[27], w[26], selector); w[62] = __byte_perm (w[26], w[25], selector); w[61] = __byte_perm (w[25], w[24], selector); w[60] = __byte_perm (w[24], w[23], selector); w[59] = __byte_perm (w[23], w[22], selector); w[58] = __byte_perm (w[22], w[21], selector); w[57] = __byte_perm (w[21], w[20], selector); w[56] = __byte_perm (w[20], w[19], selector); w[55] = __byte_perm (w[19], w[18], selector); w[54] = __byte_perm (w[18], w[17], selector); w[53] = __byte_perm (w[17], w[16], selector); w[52] = __byte_perm (w[16], w[15], selector); w[51] = __byte_perm (w[15], w[14], selector); w[50] = __byte_perm (w[14], w[13], selector); w[49] = __byte_perm (w[13], w[12], selector); w[48] = __byte_perm (w[12], w[11], selector); w[47] = __byte_perm (w[11], w[10], selector); w[46] = __byte_perm (w[10], w[ 9], selector); w[45] = __byte_perm (w[ 9], w[ 8], selector); w[44] = __byte_perm (w[ 8], w[ 7], selector); w[43] = __byte_perm (w[ 7], w[ 6], selector); w[42] = __byte_perm (w[ 6], w[ 5], selector); w[41] = __byte_perm (w[ 5], w[ 4], selector); w[40] = __byte_perm (w[ 4], w[ 3], selector); w[39] = __byte_perm (w[ 3], w[ 2], selector); w[38] = __byte_perm (w[ 2], w[ 1], selector); w[37] = __byte_perm (w[ 1], w[ 0], selector); w[36] = __byte_perm (w[ 0], 0, selector); w[35] = 0; w[34] = 
0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = __byte_perm (w[26], w[25], selector); w[62] = __byte_perm (w[25], w[24], selector); w[61] = __byte_perm (w[24], w[23], selector); w[60] = __byte_perm (w[23], w[22], selector); w[59] = __byte_perm (w[22], w[21], selector); w[58] = __byte_perm (w[21], w[20], selector); w[57] = __byte_perm (w[20], w[19], selector); w[56] = __byte_perm (w[19], w[18], selector); w[55] = __byte_perm (w[18], w[17], selector); w[54] = __byte_perm (w[17], w[16], selector); w[53] = __byte_perm (w[16], w[15], selector); w[52] = __byte_perm (w[15], w[14], selector); w[51] = __byte_perm (w[14], w[13], selector); w[50] = __byte_perm (w[13], w[12], selector); w[49] = __byte_perm (w[12], w[11], selector); w[48] = __byte_perm (w[11], w[10], selector); w[47] = __byte_perm (w[10], w[ 9], selector); w[46] = __byte_perm (w[ 9], w[ 8], selector); w[45] = __byte_perm (w[ 8], w[ 7], selector); w[44] = __byte_perm (w[ 7], w[ 6], selector); w[43] = __byte_perm (w[ 6], w[ 5], selector); w[42] = __byte_perm (w[ 5], w[ 4], selector); w[41] = __byte_perm (w[ 4], w[ 3], selector); w[40] = __byte_perm (w[ 3], w[ 2], selector); w[39] = __byte_perm (w[ 2], w[ 1], selector); w[38] = __byte_perm (w[ 1], w[ 0], selector); w[37] = __byte_perm (w[ 0], 0, selector); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] 
= 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = __byte_perm (w[25], w[24], selector); w[62] = __byte_perm (w[24], w[23], selector); w[61] = __byte_perm (w[23], w[22], selector); w[60] = __byte_perm (w[22], w[21], selector); w[59] = __byte_perm (w[21], w[20], selector); w[58] = __byte_perm (w[20], w[19], selector); w[57] = __byte_perm (w[19], w[18], selector); w[56] = __byte_perm (w[18], w[17], selector); w[55] = __byte_perm (w[17], w[16], selector); w[54] = __byte_perm (w[16], w[15], selector); w[53] = __byte_perm (w[15], w[14], selector); w[52] = __byte_perm (w[14], w[13], selector); w[51] = __byte_perm (w[13], w[12], selector); w[50] = __byte_perm (w[12], w[11], selector); w[49] = __byte_perm (w[11], w[10], selector); w[48] = __byte_perm (w[10], w[ 9], selector); w[47] = __byte_perm (w[ 9], w[ 8], selector); w[46] = __byte_perm (w[ 8], w[ 7], selector); w[45] = __byte_perm (w[ 7], w[ 6], selector); w[44] = __byte_perm (w[ 6], w[ 5], selector); w[43] = __byte_perm (w[ 5], w[ 4], selector); w[42] = __byte_perm (w[ 4], w[ 3], selector); w[41] = __byte_perm (w[ 3], w[ 2], selector); w[40] = __byte_perm (w[ 2], w[ 1], selector); w[39] = __byte_perm (w[ 1], w[ 0], selector); w[38] = __byte_perm (w[ 0], 0, selector); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = __byte_perm (w[24], w[23], selector); w[62] = __byte_perm (w[23], w[22], selector); w[61] = __byte_perm (w[22], w[21], selector); w[60] = __byte_perm (w[21], w[20], selector); w[59] = __byte_perm (w[20], w[19], selector); w[58] = __byte_perm (w[19], w[18], selector); w[57] = __byte_perm (w[18], w[17], 
selector); w[56] = __byte_perm (w[17], w[16], selector); w[55] = __byte_perm (w[16], w[15], selector); w[54] = __byte_perm (w[15], w[14], selector); w[53] = __byte_perm (w[14], w[13], selector); w[52] = __byte_perm (w[13], w[12], selector); w[51] = __byte_perm (w[12], w[11], selector); w[50] = __byte_perm (w[11], w[10], selector); w[49] = __byte_perm (w[10], w[ 9], selector); w[48] = __byte_perm (w[ 9], w[ 8], selector); w[47] = __byte_perm (w[ 8], w[ 7], selector); w[46] = __byte_perm (w[ 7], w[ 6], selector); w[45] = __byte_perm (w[ 6], w[ 5], selector); w[44] = __byte_perm (w[ 5], w[ 4], selector); w[43] = __byte_perm (w[ 4], w[ 3], selector); w[42] = __byte_perm (w[ 3], w[ 2], selector); w[41] = __byte_perm (w[ 2], w[ 1], selector); w[40] = __byte_perm (w[ 1], w[ 0], selector); w[39] = __byte_perm (w[ 0], 0, selector); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = __byte_perm (w[23], w[22], selector); w[62] = __byte_perm (w[22], w[21], selector); w[61] = __byte_perm (w[21], w[20], selector); w[60] = __byte_perm (w[20], w[19], selector); w[59] = __byte_perm (w[19], w[18], selector); w[58] = __byte_perm (w[18], w[17], selector); w[57] = __byte_perm (w[17], w[16], selector); w[56] = __byte_perm (w[16], w[15], selector); w[55] = __byte_perm (w[15], w[14], selector); w[54] = __byte_perm (w[14], w[13], selector); w[53] = __byte_perm (w[13], w[12], selector); w[52] = __byte_perm (w[12], w[11], selector); w[51] = __byte_perm (w[11], w[10], selector); w[50] = __byte_perm (w[10], w[ 9], selector); w[49] = __byte_perm (w[ 9], w[ 8], selector); w[48] = __byte_perm (w[ 8], 
w[ 7], selector); w[47] = __byte_perm (w[ 7], w[ 6], selector); w[46] = __byte_perm (w[ 6], w[ 5], selector); w[45] = __byte_perm (w[ 5], w[ 4], selector); w[44] = __byte_perm (w[ 4], w[ 3], selector); w[43] = __byte_perm (w[ 3], w[ 2], selector); w[42] = __byte_perm (w[ 2], w[ 1], selector); w[41] = __byte_perm (w[ 1], w[ 0], selector); w[40] = __byte_perm (w[ 0], 0, selector); w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = __byte_perm (w[22], w[21], selector); w[62] = __byte_perm (w[21], w[20], selector); w[61] = __byte_perm (w[20], w[19], selector); w[60] = __byte_perm (w[19], w[18], selector); w[59] = __byte_perm (w[18], w[17], selector); w[58] = __byte_perm (w[17], w[16], selector); w[57] = __byte_perm (w[16], w[15], selector); w[56] = __byte_perm (w[15], w[14], selector); w[55] = __byte_perm (w[14], w[13], selector); w[54] = __byte_perm (w[13], w[12], selector); w[53] = __byte_perm (w[12], w[11], selector); w[52] = __byte_perm (w[11], w[10], selector); w[51] = __byte_perm (w[10], w[ 9], selector); w[50] = __byte_perm (w[ 9], w[ 8], selector); w[49] = __byte_perm (w[ 8], w[ 7], selector); w[48] = __byte_perm (w[ 7], w[ 6], selector); w[47] = __byte_perm (w[ 6], w[ 5], selector); w[46] = __byte_perm (w[ 5], w[ 4], selector); w[45] = __byte_perm (w[ 4], w[ 3], selector); w[44] = __byte_perm (w[ 3], w[ 2], selector); w[43] = __byte_perm (w[ 2], w[ 1], selector); w[42] = __byte_perm (w[ 1], w[ 0], selector); w[41] = __byte_perm (w[ 0], 0, selector); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 
0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 42: w[63] = __byte_perm (w[21], w[20], selector); w[62] = __byte_perm (w[20], w[19], selector); w[61] = __byte_perm (w[19], w[18], selector); w[60] = __byte_perm (w[18], w[17], selector); w[59] = __byte_perm (w[17], w[16], selector); w[58] = __byte_perm (w[16], w[15], selector); w[57] = __byte_perm (w[15], w[14], selector); w[56] = __byte_perm (w[14], w[13], selector); w[55] = __byte_perm (w[13], w[12], selector); w[54] = __byte_perm (w[12], w[11], selector); w[53] = __byte_perm (w[11], w[10], selector); w[52] = __byte_perm (w[10], w[ 9], selector); w[51] = __byte_perm (w[ 9], w[ 8], selector); w[50] = __byte_perm (w[ 8], w[ 7], selector); w[49] = __byte_perm (w[ 7], w[ 6], selector); w[48] = __byte_perm (w[ 6], w[ 5], selector); w[47] = __byte_perm (w[ 5], w[ 4], selector); w[46] = __byte_perm (w[ 4], w[ 3], selector); w[45] = __byte_perm (w[ 3], w[ 2], selector); w[44] = __byte_perm (w[ 2], w[ 1], selector); w[43] = __byte_perm (w[ 1], w[ 0], selector); w[42] = __byte_perm (w[ 0], 0, selector); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = __byte_perm (w[20], w[19], selector); w[62] = __byte_perm (w[19], w[18], selector); w[61] = __byte_perm (w[18], w[17], selector); w[60] = 
__byte_perm (w[17], w[16], selector); w[59] = __byte_perm (w[16], w[15], selector); w[58] = __byte_perm (w[15], w[14], selector); w[57] = __byte_perm (w[14], w[13], selector); w[56] = __byte_perm (w[13], w[12], selector); w[55] = __byte_perm (w[12], w[11], selector); w[54] = __byte_perm (w[11], w[10], selector); w[53] = __byte_perm (w[10], w[ 9], selector); w[52] = __byte_perm (w[ 9], w[ 8], selector); w[51] = __byte_perm (w[ 8], w[ 7], selector); w[50] = __byte_perm (w[ 7], w[ 6], selector); w[49] = __byte_perm (w[ 6], w[ 5], selector); w[48] = __byte_perm (w[ 5], w[ 4], selector); w[47] = __byte_perm (w[ 4], w[ 3], selector); w[46] = __byte_perm (w[ 3], w[ 2], selector); w[45] = __byte_perm (w[ 2], w[ 1], selector); w[44] = __byte_perm (w[ 1], w[ 0], selector); w[43] = __byte_perm (w[ 0], 0, selector); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = __byte_perm (w[19], w[18], selector); w[62] = __byte_perm (w[18], w[17], selector); w[61] = __byte_perm (w[17], w[16], selector); w[60] = __byte_perm (w[16], w[15], selector); w[59] = __byte_perm (w[15], w[14], selector); w[58] = __byte_perm (w[14], w[13], selector); w[57] = __byte_perm (w[13], w[12], selector); w[56] = __byte_perm (w[12], w[11], selector); w[55] = __byte_perm (w[11], w[10], selector); w[54] = __byte_perm (w[10], w[ 9], selector); w[53] = __byte_perm (w[ 9], w[ 8], selector); w[52] = __byte_perm (w[ 8], w[ 7], selector); w[51] = __byte_perm (w[ 7], w[ 6], selector); w[50] = __byte_perm (w[ 6], w[ 5], selector); w[49] = __byte_perm (w[ 5], w[ 4], selector); 
w[48] = __byte_perm (w[ 4], w[ 3], selector); w[47] = __byte_perm (w[ 3], w[ 2], selector); w[46] = __byte_perm (w[ 2], w[ 1], selector); w[45] = __byte_perm (w[ 1], w[ 0], selector); w[44] = __byte_perm (w[ 0], 0, selector); w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = __byte_perm (w[18], w[17], selector); w[62] = __byte_perm (w[17], w[16], selector); w[61] = __byte_perm (w[16], w[15], selector); w[60] = __byte_perm (w[15], w[14], selector); w[59] = __byte_perm (w[14], w[13], selector); w[58] = __byte_perm (w[13], w[12], selector); w[57] = __byte_perm (w[12], w[11], selector); w[56] = __byte_perm (w[11], w[10], selector); w[55] = __byte_perm (w[10], w[ 9], selector); w[54] = __byte_perm (w[ 9], w[ 8], selector); w[53] = __byte_perm (w[ 8], w[ 7], selector); w[52] = __byte_perm (w[ 7], w[ 6], selector); w[51] = __byte_perm (w[ 6], w[ 5], selector); w[50] = __byte_perm (w[ 5], w[ 4], selector); w[49] = __byte_perm (w[ 4], w[ 3], selector); w[48] = __byte_perm (w[ 3], w[ 2], selector); w[47] = __byte_perm (w[ 2], w[ 1], selector); w[46] = __byte_perm (w[ 1], w[ 0], selector); w[45] = __byte_perm (w[ 0], 0, selector); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 
0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = __byte_perm (w[17], w[16], selector); w[62] = __byte_perm (w[16], w[15], selector); w[61] = __byte_perm (w[15], w[14], selector); w[60] = __byte_perm (w[14], w[13], selector); w[59] = __byte_perm (w[13], w[12], selector); w[58] = __byte_perm (w[12], w[11], selector); w[57] = __byte_perm (w[11], w[10], selector); w[56] = __byte_perm (w[10], w[ 9], selector); w[55] = __byte_perm (w[ 9], w[ 8], selector); w[54] = __byte_perm (w[ 8], w[ 7], selector); w[53] = __byte_perm (w[ 7], w[ 6], selector); w[52] = __byte_perm (w[ 6], w[ 5], selector); w[51] = __byte_perm (w[ 5], w[ 4], selector); w[50] = __byte_perm (w[ 4], w[ 3], selector); w[49] = __byte_perm (w[ 3], w[ 2], selector); w[48] = __byte_perm (w[ 2], w[ 1], selector); w[47] = __byte_perm (w[ 1], w[ 0], selector); w[46] = __byte_perm (w[ 0], 0, selector); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 47: w[63] = __byte_perm (w[16], w[15], selector); w[62] = __byte_perm (w[15], w[14], selector); w[61] = __byte_perm (w[14], w[13], selector); w[60] = __byte_perm (w[13], w[12], selector); w[59] = __byte_perm (w[12], w[11], selector); w[58] = __byte_perm (w[11], w[10], selector); w[57] = __byte_perm (w[10], w[ 9], selector); w[56] = __byte_perm (w[ 9], w[ 8], selector); w[55] = __byte_perm (w[ 8], w[ 7], selector); w[54] = __byte_perm (w[ 7], w[ 6], selector); w[53] = __byte_perm (w[ 6], w[ 5], selector); w[52] = __byte_perm (w[ 5], w[ 4], 
selector); w[51] = __byte_perm (w[ 4], w[ 3], selector); w[50] = __byte_perm (w[ 3], w[ 2], selector); w[49] = __byte_perm (w[ 2], w[ 1], selector); w[48] = __byte_perm (w[ 1], w[ 0], selector); w[47] = __byte_perm (w[ 0], 0, selector); w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = __byte_perm (w[15], w[14], selector); w[62] = __byte_perm (w[14], w[13], selector); w[61] = __byte_perm (w[13], w[12], selector); w[60] = __byte_perm (w[12], w[11], selector); w[59] = __byte_perm (w[11], w[10], selector); w[58] = __byte_perm (w[10], w[ 9], selector); w[57] = __byte_perm (w[ 9], w[ 8], selector); w[56] = __byte_perm (w[ 8], w[ 7], selector); w[55] = __byte_perm (w[ 7], w[ 6], selector); w[54] = __byte_perm (w[ 6], w[ 5], selector); w[53] = __byte_perm (w[ 5], w[ 4], selector); w[52] = __byte_perm (w[ 4], w[ 3], selector); w[51] = __byte_perm (w[ 3], w[ 2], selector); w[50] = __byte_perm (w[ 2], w[ 1], selector); w[49] = __byte_perm (w[ 1], w[ 0], selector); w[48] = __byte_perm (w[ 0], 0, selector); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 
2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = __byte_perm (w[14], w[13], selector); w[62] = __byte_perm (w[13], w[12], selector); w[61] = __byte_perm (w[12], w[11], selector); w[60] = __byte_perm (w[11], w[10], selector); w[59] = __byte_perm (w[10], w[ 9], selector); w[58] = __byte_perm (w[ 9], w[ 8], selector); w[57] = __byte_perm (w[ 8], w[ 7], selector); w[56] = __byte_perm (w[ 7], w[ 6], selector); w[55] = __byte_perm (w[ 6], w[ 5], selector); w[54] = __byte_perm (w[ 5], w[ 4], selector); w[53] = __byte_perm (w[ 4], w[ 3], selector); w[52] = __byte_perm (w[ 3], w[ 2], selector); w[51] = __byte_perm (w[ 2], w[ 1], selector); w[50] = __byte_perm (w[ 1], w[ 0], selector); w[49] = __byte_perm (w[ 0], 0, selector); w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = __byte_perm (w[13], w[12], selector); w[62] = __byte_perm (w[12], w[11], selector); w[61] = __byte_perm (w[11], w[10], selector); w[60] = __byte_perm (w[10], w[ 9], selector); w[59] = __byte_perm (w[ 9], w[ 8], selector); w[58] = __byte_perm (w[ 8], w[ 7], selector); w[57] = __byte_perm (w[ 7], w[ 6], selector); w[56] = __byte_perm (w[ 6], w[ 5], selector); w[55] = __byte_perm (w[ 5], w[ 4], selector); w[54] = __byte_perm (w[ 4], w[ 3], selector); w[53] = __byte_perm (w[ 3], w[ 2], selector); w[52] = __byte_perm (w[ 2], w[ 1], selector); w[51] = __byte_perm (w[ 1], w[ 0], selector); w[50] = __byte_perm (w[ 0], 0, selector); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] 
= 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = __byte_perm (w[12], w[11], selector); w[62] = __byte_perm (w[11], w[10], selector); w[61] = __byte_perm (w[10], w[ 9], selector); w[60] = __byte_perm (w[ 9], w[ 8], selector); w[59] = __byte_perm (w[ 8], w[ 7], selector); w[58] = __byte_perm (w[ 7], w[ 6], selector); w[57] = __byte_perm (w[ 6], w[ 5], selector); w[56] = __byte_perm (w[ 5], w[ 4], selector); w[55] = __byte_perm (w[ 4], w[ 3], selector); w[54] = __byte_perm (w[ 3], w[ 2], selector); w[53] = __byte_perm (w[ 2], w[ 1], selector); w[52] = __byte_perm (w[ 1], w[ 0], selector); w[51] = __byte_perm (w[ 0], 0, selector); w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = __byte_perm (w[11], w[10], selector); w[62] = __byte_perm (w[10], w[ 9], selector); w[61] = __byte_perm (w[ 9], w[ 8], selector); w[60] = __byte_perm (w[ 8], w[ 7], selector); w[59] = __byte_perm (w[ 7], w[ 6], selector); w[58] = __byte_perm (w[ 6], w[ 5], selector); w[57] = __byte_perm (w[ 5], w[ 4], selector); w[56] = 
__byte_perm (w[ 4], w[ 3], selector); w[55] = __byte_perm (w[ 3], w[ 2], selector); w[54] = __byte_perm (w[ 2], w[ 1], selector); w[53] = __byte_perm (w[ 1], w[ 0], selector); w[52] = __byte_perm (w[ 0], 0, selector); w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = __byte_perm (w[10], w[ 9], selector); w[62] = __byte_perm (w[ 9], w[ 8], selector); w[61] = __byte_perm (w[ 8], w[ 7], selector); w[60] = __byte_perm (w[ 7], w[ 6], selector); w[59] = __byte_perm (w[ 6], w[ 5], selector); w[58] = __byte_perm (w[ 5], w[ 4], selector); w[57] = __byte_perm (w[ 4], w[ 3], selector); w[56] = __byte_perm (w[ 3], w[ 2], selector); w[55] = __byte_perm (w[ 2], w[ 1], selector); w[54] = __byte_perm (w[ 1], w[ 0], selector); w[53] = __byte_perm (w[ 0], 0, selector); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = __byte_perm (w[ 9], w[ 8], selector); w[62] = __byte_perm (w[ 8], w[ 7], selector); 
w[61] = __byte_perm (w[ 7], w[ 6], selector); w[60] = __byte_perm (w[ 6], w[ 5], selector); w[59] = __byte_perm (w[ 5], w[ 4], selector); w[58] = __byte_perm (w[ 4], w[ 3], selector); w[57] = __byte_perm (w[ 3], w[ 2], selector); w[56] = __byte_perm (w[ 2], w[ 1], selector); w[55] = __byte_perm (w[ 1], w[ 0], selector); w[54] = __byte_perm (w[ 0], 0, selector); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = __byte_perm (w[ 8], w[ 7], selector); w[62] = __byte_perm (w[ 7], w[ 6], selector); w[61] = __byte_perm (w[ 6], w[ 5], selector); w[60] = __byte_perm (w[ 5], w[ 4], selector); w[59] = __byte_perm (w[ 4], w[ 3], selector); w[58] = __byte_perm (w[ 3], w[ 2], selector); w[57] = __byte_perm (w[ 2], w[ 1], selector); w[56] = __byte_perm (w[ 1], w[ 0], selector); w[55] = __byte_perm (w[ 0], 0, selector); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; 
case 56: w[63] = __byte_perm (w[ 7], w[ 6], selector); w[62] = __byte_perm (w[ 6], w[ 5], selector); w[61] = __byte_perm (w[ 5], w[ 4], selector); w[60] = __byte_perm (w[ 4], w[ 3], selector); w[59] = __byte_perm (w[ 3], w[ 2], selector); w[58] = __byte_perm (w[ 2], w[ 1], selector); w[57] = __byte_perm (w[ 1], w[ 0], selector); w[56] = __byte_perm (w[ 0], 0, selector); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = __byte_perm (w[ 6], w[ 5], selector); w[62] = __byte_perm (w[ 5], w[ 4], selector); w[61] = __byte_perm (w[ 4], w[ 3], selector); w[60] = __byte_perm (w[ 3], w[ 2], selector); w[59] = __byte_perm (w[ 2], w[ 1], selector); w[58] = __byte_perm (w[ 1], w[ 0], selector); w[57] = __byte_perm (w[ 0], 0, selector); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = __byte_perm (w[ 5], w[ 
4], selector); w[62] = __byte_perm (w[ 4], w[ 3], selector); w[61] = __byte_perm (w[ 3], w[ 2], selector); w[60] = __byte_perm (w[ 2], w[ 1], selector); w[59] = __byte_perm (w[ 1], w[ 0], selector); w[58] = __byte_perm (w[ 0], 0, selector); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = __byte_perm (w[ 4], w[ 3], selector); w[62] = __byte_perm (w[ 3], w[ 2], selector); w[61] = __byte_perm (w[ 2], w[ 1], selector); w[60] = __byte_perm (w[ 1], w[ 0], selector); w[59] = __byte_perm (w[ 0], 0, selector); w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = __byte_perm (w[ 3], w[ 2], selector); w[62] = __byte_perm (w[ 2], w[ 1], selector); w[61] = __byte_perm (w[ 1], w[ 0], selector); w[60] = __byte_perm (w[ 0], 0, selector); w[59] = 0; w[58] = 0; w[57] = 0; 
w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = __byte_perm (w[ 2], w[ 1], selector); w[62] = __byte_perm (w[ 1], w[ 0], selector); w[61] = __byte_perm (w[ 0], 0, selector); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = __byte_perm (w[ 1], w[ 0], selector); w[62] = __byte_perm (w[ 0], 0, selector); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 
0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 63: w[63] = __byte_perm (w[ 0], 0, selector); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #endif }

/**
 * vector functions as scalar (for outer loop usage)
 */

/**
 * Truncate the 16-byte block held in w0[0..3] to its first `len` bytes,
 * zeroing everything after them in place.
 *
 * Bytes are counted from the least-significant byte of each 32-bit word:
 * the partial-word masks 0x000000ff / 0x0000ffff / 0x00ffffff keep the low
 * 1 / 2 / 3 bytes of the word that contains the cut, and every following
 * word is set to 0.
 *
 * Only len values 0..15 have a case. Any other value (16 or more) matches
 * nothing in the switch, so w0 is left unchanged.
 */
DECLSPEC void truncate_block_4x4_le_S (u32 w0[4], const u32 len)
{
  switch (len)
  {
    /* cut falls inside w0[0] */
    case 0: w0[0] = 0; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 1: w0[0] &= 0x000000ff; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 2: w0[0] &= 0x0000ffff; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 3: w0[0] &= 0x00ffffff; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    /* cut falls inside w0[1]; w0[0] is kept whole */
    case 4: w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 5: w0[1] &= 0x000000ff; w0[2] = 0; w0[3] = 0; break;
    case 6: w0[1] &= 0x0000ffff; w0[2] = 0; w0[3] = 0; break;
    case 7: w0[1] &= 0x00ffffff; w0[2] = 0; w0[3] = 0; break;
    /* cut falls inside w0[2]; w0[0..1] are kept whole */
    case 8: w0[2] = 0; w0[3] = 0; break;
    case 9: w0[2] &= 0x000000ff; w0[3] = 0; break;
    case 10: w0[2] &= 0x0000ffff; w0[3] = 0; break;
    case 11: w0[2] &= 0x00ffffff; w0[3] = 0; break;
    /* cut falls inside w0[3]; w0[0..2] are kept whole */
    case 12: w0[3] = 0; break;
    case 13: w0[3] &= 0x000000ff; break;
    case 14: w0[3] &= 0x0000ffff; break;
    case 15: w0[3] &= 0x00ffffff; break;
  }
}

/**
 * Truncate the 16-byte block held in w0[0..3] to its first `len` bytes,
 * zeroing everything after them in place.
 *
 * Same layout of cases as truncate_block_4x4_le_S, but bytes are counted
 * from the most-significant byte of each 32-bit word: the partial-word
 * masks 0xff000000 / 0xffff0000 / 0xffffff00 keep the high 1 / 2 / 3 bytes
 * of the word that contains the cut, and every following word is set to 0.
 *
 * Only len values 0..15 have a case. Any other value (16 or more) matches
 * nothing in the switch, so w0 is left unchanged.
 */
DECLSPEC void truncate_block_4x4_be_S (u32 w0[4], const u32 len)
{
  switch (len)
  {
    /* cut falls inside w0[0] */
    case 0: w0[0] = 0; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 1: w0[0] &= 0xff000000; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 2: w0[0] &= 0xffff0000; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 3: w0[0] &= 0xffffff00; w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    /* cut falls inside w0[1]; w0[0] is kept whole */
    case 4: w0[1] = 0; w0[2] = 0; w0[3] = 0; break;
    case 5: w0[1] &= 0xff000000; w0[2] = 0; w0[3] = 0; break;
    case 6: w0[1] &= 0xffff0000; w0[2] = 0; w0[3] = 0; break;
    case 7: w0[1] &= 0xffffff00; w0[2] = 0; w0[3] = 0; break;
    /* cut falls inside w0[2]; w0[0..1] are kept whole */
    case 8: w0[2] = 0; w0[3] = 0; break;
    case 9: w0[2] &= 0xff000000; w0[3] = 0; break;
    case 10: w0[2] &= 0xffff0000; w0[3] = 0; break;
    case 11: w0[2] &= 0xffffff00; w0[3] = 0; break;
    /* cut falls inside w0[3]; w0[0..2] are kept whole */
    case 12: w0[3] = 0; break;
    case 13: w0[3] &= 0xff000000; break;
    case 14: w0[3] &= 0xffff0000; break;
    case 15: w0[3] &= 0xffffff00; break;
  }
}

DECLSPEC void truncate_block_16x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 len) { switch (len) { case 0: w0[0] = 0; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 1: w0[0] &= 0x000000ff; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 2: w0[0] &= 0x0000ffff; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 3: w0[0] &= 0x00ffffff; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 4: w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 5: w0[1] &= 0x000000ff; w0[2] = 0; w0[3] = 0; w1[0] = 
0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 6: w0[1] &= 0x0000ffff; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 7: w0[1] &= 0x00ffffff; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 8: w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 9: w0[2] &= 0x000000ff; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 10: w0[2] &= 0x0000ffff; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 11: w0[2] &= 0x00ffffff; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 12: w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 13: w0[3] &= 0x000000ff; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 14: w0[3] &= 0x0000ffff; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 15: w0[3] &= 0x00ffffff; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 16: w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] 
= 0; w3[2] = 0; w3[3] = 0; break; case 17: w1[0] &= 0x000000ff; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 18: w1[0] &= 0x0000ffff; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 19: w1[0] &= 0x00ffffff; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 20: w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 21: w1[1] &= 0x000000ff; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 22: w1[1] &= 0x0000ffff; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 23: w1[1] &= 0x00ffffff; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 24: w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 25: w1[2] &= 0x000000ff; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 26: w1[2] &= 0x0000ffff; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 27: w1[2] &= 0x00ffffff; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 28: w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 29: w1[3] &= 0x000000ff; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 30: w1[3] &= 0x0000ffff; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 31: w1[3] 
&= 0x00ffffff; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 32: w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 33: w2[0] &= 0x000000ff; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 34: w2[0] &= 0x0000ffff; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 35: w2[0] &= 0x00ffffff; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 36: w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 37: w2[1] &= 0x000000ff; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 38: w2[1] &= 0x0000ffff; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 39: w2[1] &= 0x00ffffff; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 40: w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 41: w2[2] &= 0x000000ff; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 42: w2[2] &= 0x0000ffff; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 43: w2[2] &= 0x00ffffff; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 44: w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 45: w2[3] &= 0x000000ff; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 46: w2[3] &= 0x0000ffff; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 47: w2[3] &= 0x00ffffff; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 48: w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 49: w3[0] &= 0x000000ff; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 50: w3[0] &= 0x0000ffff; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 51: w3[0] &= 0x00ffffff; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 52: w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 53: w3[1] &= 0x000000ff; w3[2] = 0; w3[3] = 
0; break; case 54: w3[1] &= 0x0000ffff; w3[2] = 0; w3[3] = 0; break; case 55: w3[1] &= 0x00ffffff; w3[2] = 0; w3[3] = 0; break; case 56: w3[2] = 0; w3[3] = 0; break; case 57: w3[2] &= 0x000000ff; w3[3] = 0; break; case 58: w3[2] &= 0x0000ffff; w3[3] = 0; break; case 59: w3[2] &= 0x00ffffff; w3[3] = 0; break; case 60: w3[3] = 0; break; case 61: w3[3] &= 0x000000ff; break; case 62: w3[3] &= 0x0000ffff; break; case 63: w3[3] &= 0x00ffffff; break; } } DECLSPEC void truncate_block_16x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 len) { switch (len) { case 0: w0[0] = 0; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 1: w0[0] &= 0xff000000; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 2: w0[0] &= 0xffff0000; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 3: w0[0] &= 0xffffff00; w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 4: w0[1] = 0; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 5: w0[1] &= 0xff000000; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 6: w0[1] &= 0xffff0000; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 7: w0[1] &= 0xffffff00; w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; 
w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 8: w0[2] = 0; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 9: w0[2] &= 0xff000000; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 10: w0[2] &= 0xffff0000; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 11: w0[2] &= 0xffffff00; w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 12: w0[3] = 0; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 13: w0[3] &= 0xff000000; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 14: w0[3] &= 0xffff0000; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 15: w0[3] &= 0xffffff00; w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 16: w1[0] = 0; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 17: w1[0] &= 0xff000000; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 18: w1[0] &= 0xffff0000; w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 19: w1[0] &= 0xffffff00; w1[1] = 0; 
w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 20: w1[1] = 0; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 21: w1[1] &= 0xff000000; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 22: w1[1] &= 0xffff0000; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 23: w1[1] &= 0xffffff00; w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 24: w1[2] = 0; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 25: w1[2] &= 0xff000000; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 26: w1[2] &= 0xffff0000; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 27: w1[2] &= 0xffffff00; w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 28: w1[3] = 0; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 29: w1[3] &= 0xff000000; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 30: w1[3] &= 0xffff0000; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 31: w1[3] &= 0xffffff00; w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 32: w2[0] = 0; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 33: w2[0] &= 0xff000000; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 34: w2[0] &= 0xffff0000; w2[1] = 0; w2[2] = 0; w2[3] = 0; 
w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 35: w2[0] &= 0xffffff00; w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 36: w2[1] = 0; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 37: w2[1] &= 0xff000000; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 38: w2[1] &= 0xffff0000; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 39: w2[1] &= 0xffffff00; w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 40: w2[2] = 0; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 41: w2[2] &= 0xff000000; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 42: w2[2] &= 0xffff0000; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 43: w2[2] &= 0xffffff00; w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 44: w2[3] = 0; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 45: w2[3] &= 0xff000000; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 46: w2[3] &= 0xffff0000; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 47: w2[3] &= 0xffffff00; w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 48: w3[0] = 0; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 49: w3[0] &= 0xff000000; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 50: w3[0] &= 0xffff0000; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 51: w3[0] &= 0xffffff00; w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 52: w3[1] = 0; w3[2] = 0; w3[3] = 0; break; case 53: w3[1] &= 0xff000000; w3[2] = 0; w3[3] = 0; break; case 54: w3[1] &= 0xffff0000; w3[2] = 0; w3[3] = 0; break; case 55: w3[1] &= 0xffffff00; w3[2] = 0; w3[3] = 0; break; case 56: w3[2] = 0; w3[3] = 0; break; case 57: w3[2] &= 0xff000000; w3[3] = 0; break; case 58: w3[2] &= 0xffff0000; w3[3] = 0; break; case 59: w3[2] &= 0xffffff00; w3[3] = 0; break; case 60: w3[3] = 0; break; case 61: w3[3] &= 0xff000000; break; case 62: w3[3] &= 
0xffff0000; break; case 63: w3[3] &= 0xffffff00; break; } } DECLSPEC void append_helper_1x4_S (u32 r[4], const u32 v, const u32 m[4]) { r[0] |= v & m[0]; r[1] |= v & m[1]; r[2] |= v & m[2]; r[3] |= v & m[3]; } DECLSPEC void append_0x01_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4_S (w0, ((offset16 == 0) ? 0x01010101 : 0), v); append_helper_1x4_S (w1, ((offset16 == 1) ? 0x01010101 : 0), v); } DECLSPEC void append_0x80_1x4_S (u32 w0[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; append_helper_1x4_S (w0, 0x80808080, v); } DECLSPEC void append_0x80_2x4_S (u32 w0[4], u32 w1[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); } DECLSPEC void append_0x80_3x4_S (u32 w0[4], u32 w1[4], u32 w2[4], const u32 offset) { const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] }; const u32 offset16 = offset / 16; append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v); append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v); append_helper_1x4_S (w2, ((offset16 == 2) ? 
0x80808080 : 0), v);
}

/**
 * OR a single 0x80 byte into the 64-byte block w0..w3 at byte position
 * `offset`. Offsets of 64 and above leave all groups unchanged.
 */
DECLSPEC void append_0x80_4x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  // byte position inside a 16-byte group, and which group is addressed
  const u32 lane  = offset & 0xf;
  const u32 group = offset / 16;

  // single-byte mask for `lane`, copied out of the constant table
  const u32 byte_mask[4] =
  {
    c_append_helper_mini[lane][0],
    c_append_helper_mini[lane][1],
    c_append_helper_mini[lane][2],
    c_append_helper_mini[lane][3]
  };

  // only the addressed group receives a non-zero fill value
  const u32 fill0 = (group == 0) ? 0x80808080 : 0;
  const u32 fill1 = (group == 1) ? 0x80808080 : 0;
  const u32 fill2 = (group == 2) ? 0x80808080 : 0;
  const u32 fill3 = (group == 3) ? 0x80808080 : 0;

  append_helper_1x4_S (w0, fill0, byte_mask);
  append_helper_1x4_S (w1, fill1, byte_mask);
  append_helper_1x4_S (w2, fill2, byte_mask);
  append_helper_1x4_S (w3, fill3, byte_mask);
}

/**
 * OR a single 0x80 byte into the 128-byte block w0..w7 at byte position
 * `offset`. Offsets of 128 and above leave all groups unchanged.
 */
DECLSPEC void append_0x80_8x4_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset)
{
  const u32 v[4] = { c_append_helper_mini[offset & 0xf][0], c_append_helper_mini[offset & 0xf][1], c_append_helper_mini[offset & 0xf][2], c_append_helper_mini[offset & 0xf][3] };

  const u32 offset16 = offset / 16;

  append_helper_1x4_S (w0, ((offset16 == 0) ? 0x80808080 : 0), v);
  append_helper_1x4_S (w1, ((offset16 == 1) ? 0x80808080 : 0), v);
  append_helper_1x4_S (w2, ((offset16 == 2) ? 0x80808080 : 0), v);
  append_helper_1x4_S (w3, ((offset16 == 3) ? 0x80808080 : 0), v);
  append_helper_1x4_S (w4, ((offset16 == 4) ? 0x80808080 : 0), v);
  append_helper_1x4_S (w5, ((offset16 == 5) ? 0x80808080 : 0), v);
  append_helper_1x4_S (w6, ((offset16 == 6) ? 0x80808080 : 0), v);
  append_helper_1x4_S (w7, ((offset16 == 7) ?
0x80808080 : 0), v); } DECLSPEC void make_utf16be_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #if defined IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x3727); out2[2] = __byte_perm_S (in[3], 0, 0x1707); out2[1] = __byte_perm_S (in[2], 0, 0x3727); out2[0] = __byte_perm_S (in[2], 0, 0x1707); out1[3] = __byte_perm_S (in[1], 0, 0x3727); out1[2] = __byte_perm_S (in[1], 0, 0x1707); out1[1] = __byte_perm_S (in[0], 0, 0x3727); out1[0] = __byte_perm_S (in[0], 0, 0x1707); #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm_S (in[3], 0, 0x03070207); out2[2] = __byte_perm_S (in[3], 0, 0x01070007); out2[1] = __byte_perm_S (in[2], 0, 0x03070207); out2[0] = __byte_perm_S (in[2], 0, 0x01070007); out1[3] = __byte_perm_S (in[1], 0, 0x03070207); out1[2] = __byte_perm_S (in[1], 0, 0x01070007); out1[1] = __byte_perm_S (in[0], 0, 0x03070207); out1[0] = __byte_perm_S (in[0], 0, 0x01070007); #else out2[3] = ((in[3] >> 0) & 0xFF000000) | ((in[3] >> 8) & 0x0000FF00); out2[2] = ((in[3] << 16) & 0xFF000000) | ((in[3] << 8) & 0x0000FF00); out2[1] = ((in[2] >> 0) & 0xFF000000) | ((in[2] >> 8) & 0x0000FF00); out2[0] = ((in[2] << 16) & 0xFF000000) | ((in[2] << 8) & 0x0000FF00); out1[3] = ((in[1] >> 0) & 0xFF000000) | ((in[1] >> 8) & 0x0000FF00); out1[2] = ((in[1] << 16) & 0xFF000000) | ((in[1] << 8) & 0x0000FF00); out1[1] = ((in[0] >> 0) & 0xFF000000) | ((in[0] >> 8) & 0x0000FF00); out1[0] = ((in[0] << 16) & 0xFF000000) | ((in[0] << 8) & 0x0000FF00); #endif } DECLSPEC void make_utf16le_S (const u32 in[4], u32 out1[4], u32 out2[4]) { #if defined IS_NV out2[3] = __byte_perm_S (in[3], 0, 0x7372); out2[2] = __byte_perm_S (in[3], 0, 0x7170); out2[1] = __byte_perm_S (in[2], 0, 0x7372); out2[0] = __byte_perm_S (in[2], 0, 0x7170); out1[3] = __byte_perm_S (in[1], 0, 0x7372); out1[2] = __byte_perm_S (in[1], 0, 0x7170); out1[1] = __byte_perm_S (in[0], 0, 0x7372); out1[0] = __byte_perm_S (in[0], 0, 0x7170); #elif defined IS_AMD && AMD_GCN >= 3 out2[3] = __byte_perm_S (in[3], 0, 0x07030702); out2[2] = 
__byte_perm_S (in[3], 0, 0x07010700); out2[1] = __byte_perm_S (in[2], 0, 0x07030702); out2[0] = __byte_perm_S (in[2], 0, 0x07010700); out1[3] = __byte_perm_S (in[1], 0, 0x07030702); out1[2] = __byte_perm_S (in[1], 0, 0x07010700); out1[1] = __byte_perm_S (in[0], 0, 0x07030702); out1[0] = __byte_perm_S (in[0], 0, 0x07010700); #else out2[3] = ((in[3] >> 8) & 0x00FF0000) | ((in[3] >> 16) & 0x000000FF); out2[2] = ((in[3] << 8) & 0x00FF0000) | ((in[3] >> 0) & 0x000000FF); out2[1] = ((in[2] >> 8) & 0x00FF0000) | ((in[2] >> 16) & 0x000000FF); out2[0] = ((in[2] << 8) & 0x00FF0000) | ((in[2] >> 0) & 0x000000FF); out1[3] = ((in[1] >> 8) & 0x00FF0000) | ((in[1] >> 16) & 0x000000FF); out1[2] = ((in[1] << 8) & 0x00FF0000) | ((in[1] >> 0) & 0x000000FF); out1[1] = ((in[0] >> 8) & 0x00FF0000) | ((in[0] >> 16) & 0x000000FF); out1[0] = ((in[0] << 8) & 0x00FF0000) | ((in[0] >> 0) & 0x000000FF); #endif } DECLSPEC void undo_utf16be_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #if defined IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x4602); out[1] = __byte_perm_S (in1[2], in1[3], 0x4602); out[2] = __byte_perm_S (in2[0], in2[1], 0x4602); out[3] = __byte_perm_S (in2[2], in2[3], 0x4602); #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm_S (in1[0], in1[1], 0x04060002); out[1] = __byte_perm_S (in1[2], in1[3], 0x04060002); out[2] = __byte_perm_S (in2[0], in2[1], 0x04060002); out[3] = __byte_perm_S (in2[2], in2[3], 0x04060002); #else out[0] = ((in1[0] & 0x0000ff00) >> 8) | ((in1[0] & 0xff000000) >> 16) | ((in1[1] & 0x0000ff00) << 8) | ((in1[1] & 0xff000000) << 0); out[1] = ((in1[2] & 0x0000ff00) >> 8) | ((in1[2] & 0xff000000) >> 16) | ((in1[3] & 0x0000ff00) << 8) | ((in1[3] & 0xff000000) << 0); out[2] = ((in2[0] & 0x0000ff00) >> 8) | ((in2[0] & 0xff000000) >> 16) | ((in2[1] & 0x0000ff00) << 8) | ((in2[1] & 0xff000000) << 0); out[3] = ((in2[2] & 0x0000ff00) >> 8) | ((in2[2] & 0xff000000) >> 16) | ((in2[3] & 0x0000ff00) << 8) | ((in2[3] & 0xff000000) << 0); #endif } DECLSPEC 
void undo_utf16le_S (const u32 in1[4], const u32 in2[4], u32 out[4]) { #if defined IS_NV out[0] = __byte_perm_S (in1[0], in1[1], 0x6420); out[1] = __byte_perm_S (in1[2], in1[3], 0x6420); out[2] = __byte_perm_S (in2[0], in2[1], 0x6420); out[3] = __byte_perm_S (in2[2], in2[3], 0x6420); #elif defined IS_AMD && AMD_GCN >= 3 out[0] = __byte_perm_S (in1[0], in1[1], 0x06040200); out[1] = __byte_perm_S (in1[2], in1[3], 0x06040200); out[2] = __byte_perm_S (in2[0], in2[1], 0x06040200); out[3] = __byte_perm_S (in2[2], in2[3], 0x06040200); #else out[0] = ((in1[0] & 0x000000ff) >> 0) | ((in1[0] & 0x00ff0000) >> 8) | ((in1[1] & 0x000000ff) << 16) | ((in1[1] & 0x00ff0000) << 8); out[1] = ((in1[2] & 0x000000ff) >> 0) | ((in1[2] & 0x00ff0000) >> 8) | ((in1[3] & 0x000000ff) << 16) | ((in1[3] & 0x00ff0000) << 8); out[2] = ((in2[0] & 0x000000ff) >> 0) | ((in2[0] & 0x00ff0000) >> 8) | ((in2[1] & 0x000000ff) << 16) | ((in2[1] & 0x00ff0000) << 8); out[3] = ((in2[2] & 0x000000ff) >> 0) | ((in2[2] & 0x00ff0000) >> 8) | ((in2[3] & 0x000000ff) << 16) | ((in2[3] & 0x00ff0000) << 8); #endif } DECLSPEC void switch_buffer_by_offset_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset) { const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); w0[3] = swap32_S (w0[3]); w1[0] = swap32_S (w1[0]); w1[1] = swap32_S (w1[1]); w1[2] = swap32_S (w1[2]); w1[3] = swap32_S (w1[3]); w2[0] = swap32_S (w2[0]); w2[1] = swap32_S (w2[1]); w2[2] = swap32_S (w2[2]); w2[3] = swap32_S (w2[3]); w3[0] = swap32_S (w3[0]); w3[1] = swap32_S (w3[1]); w3[2] = swap32_S (w3[2]); w3[3] = swap32_S (w3[3]); switch (offset_switch) { case 0: w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); w3[1] = amd_bytealign_S (w3[0], w3[1], offset); w3[0] = amd_bytealign_S (w2[3], w3[0], 
offset); w2[3] = amd_bytealign_S (w2[2], w2[3], offset); w2[2] = amd_bytealign_S (w2[1], w2[2], offset); w2[1] = amd_bytealign_S (w2[0], w2[1], offset); w2[0] = amd_bytealign_S (w1[3], w2[0], offset); w1[3] = amd_bytealign_S (w1[2], w1[3], offset); w1[2] = amd_bytealign_S (w1[1], w1[2], offset); w1[1] = amd_bytealign_S (w1[0], w1[1], offset); w1[0] = amd_bytealign_S (w0[3], w1[0], offset); w0[3] = amd_bytealign_S (w0[2], w0[3], offset); w0[2] = amd_bytealign_S (w0[1], w0[2], offset); w0[1] = amd_bytealign_S (w0[0], w0[1], offset); w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: w3[3] = amd_bytealign_S (w3[1], w3[2], offset); w3[2] = amd_bytealign_S (w3[0], w3[1], offset); w3[1] = amd_bytealign_S (w2[3], w3[0], offset); w3[0] = amd_bytealign_S (w2[2], w2[3], offset); w2[3] = amd_bytealign_S (w2[1], w2[2], offset); w2[2] = amd_bytealign_S (w2[0], w2[1], offset); w2[1] = amd_bytealign_S (w1[3], w2[0], offset); w2[0] = amd_bytealign_S (w1[2], w1[3], offset); w1[3] = amd_bytealign_S (w1[1], w1[2], offset); w1[2] = amd_bytealign_S (w1[0], w1[1], offset); w1[1] = amd_bytealign_S (w0[3], w1[0], offset); w1[0] = amd_bytealign_S (w0[2], w0[3], offset); w0[3] = amd_bytealign_S (w0[1], w0[2], offset); w0[2] = amd_bytealign_S (w0[0], w0[1], offset); w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: w3[3] = amd_bytealign_S (w3[0], w3[1], offset); w3[2] = amd_bytealign_S (w2[3], w3[0], offset); w3[1] = amd_bytealign_S (w2[2], w2[3], offset); w3[0] = amd_bytealign_S (w2[1], w2[2], offset); w2[3] = amd_bytealign_S (w2[0], w2[1], offset); w2[2] = amd_bytealign_S (w1[3], w2[0], offset); w2[1] = amd_bytealign_S (w1[2], w1[3], offset); w2[0] = amd_bytealign_S (w1[1], w1[2], offset); w1[3] = amd_bytealign_S (w1[0], w1[1], offset); w1[2] = amd_bytealign_S (w0[3], w1[0], offset); w1[1] = amd_bytealign_S (w0[2], w0[3], offset); w1[0] = amd_bytealign_S (w0[1], w0[2], offset); w0[3] = amd_bytealign_S (w0[0], w0[1], offset); w0[2] = amd_bytealign_S ( 0, w0[0], 
offset); w0[1] = 0; w0[0] = 0; break; case 3: w3[3] = amd_bytealign_S (w2[3], w3[0], offset); w3[2] = amd_bytealign_S (w2[2], w2[3], offset); w3[1] = amd_bytealign_S (w2[1], w2[2], offset); w3[0] = amd_bytealign_S (w2[0], w2[1], offset); w2[3] = amd_bytealign_S (w1[3], w2[0], offset); w2[2] = amd_bytealign_S (w1[2], w1[3], offset); w2[1] = amd_bytealign_S (w1[1], w1[2], offset); w2[0] = amd_bytealign_S (w1[0], w1[1], offset); w1[3] = amd_bytealign_S (w0[3], w1[0], offset); w1[2] = amd_bytealign_S (w0[2], w0[3], offset); w1[1] = amd_bytealign_S (w0[1], w0[2], offset); w1[0] = amd_bytealign_S (w0[0], w0[1], offset); w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w3[3] = amd_bytealign_S (w2[2], w2[3], offset); w3[2] = amd_bytealign_S (w2[1], w2[2], offset); w3[1] = amd_bytealign_S (w2[0], w2[1], offset); w3[0] = amd_bytealign_S (w1[3], w2[0], offset); w2[3] = amd_bytealign_S (w1[2], w1[3], offset); w2[2] = amd_bytealign_S (w1[1], w1[2], offset); w2[1] = amd_bytealign_S (w1[0], w1[1], offset); w2[0] = amd_bytealign_S (w0[3], w1[0], offset); w1[3] = amd_bytealign_S (w0[2], w0[3], offset); w1[2] = amd_bytealign_S (w0[1], w0[2], offset); w1[1] = amd_bytealign_S (w0[0], w0[1], offset); w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w3[3] = amd_bytealign_S (w2[1], w2[2], offset); w3[2] = amd_bytealign_S (w2[0], w2[1], offset); w3[1] = amd_bytealign_S (w1[3], w2[0], offset); w3[0] = amd_bytealign_S (w1[2], w1[3], offset); w2[3] = amd_bytealign_S (w1[1], w1[2], offset); w2[2] = amd_bytealign_S (w1[0], w1[1], offset); w2[1] = amd_bytealign_S (w0[3], w1[0], offset); w2[0] = amd_bytealign_S (w0[2], w0[3], offset); w1[3] = amd_bytealign_S (w0[1], w0[2], offset); w1[2] = amd_bytealign_S (w0[0], w0[1], offset); w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w3[3] = amd_bytealign_S (w2[0], w2[1], offset); w3[2] = 
amd_bytealign_S (w1[3], w2[0], offset); w3[1] = amd_bytealign_S (w1[2], w1[3], offset); w3[0] = amd_bytealign_S (w1[1], w1[2], offset); w2[3] = amd_bytealign_S (w1[0], w1[1], offset); w2[2] = amd_bytealign_S (w0[3], w1[0], offset); w2[1] = amd_bytealign_S (w0[2], w0[3], offset); w2[0] = amd_bytealign_S (w0[1], w0[2], offset); w1[3] = amd_bytealign_S (w0[0], w0[1], offset); w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w3[3] = amd_bytealign_S (w1[3], w2[0], offset); w3[2] = amd_bytealign_S (w1[2], w1[3], offset); w3[1] = amd_bytealign_S (w1[1], w1[2], offset); w3[0] = amd_bytealign_S (w1[0], w1[1], offset); w2[3] = amd_bytealign_S (w0[3], w1[0], offset); w2[2] = amd_bytealign_S (w0[2], w0[3], offset); w2[1] = amd_bytealign_S (w0[1], w0[2], offset); w2[0] = amd_bytealign_S (w0[0], w0[1], offset); w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w3[3] = amd_bytealign_S (w1[2], w1[3], offset); w3[2] = amd_bytealign_S (w1[1], w1[2], offset); w3[1] = amd_bytealign_S (w1[0], w1[1], offset); w3[0] = amd_bytealign_S (w0[3], w1[0], offset); w2[3] = amd_bytealign_S (w0[2], w0[3], offset); w2[2] = amd_bytealign_S (w0[1], w0[2], offset); w2[1] = amd_bytealign_S (w0[0], w0[1], offset); w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w3[3] = amd_bytealign_S (w1[1], w1[2], offset); w3[2] = amd_bytealign_S (w1[0], w1[1], offset); w3[1] = amd_bytealign_S (w0[3], w1[0], offset); w3[0] = amd_bytealign_S (w0[2], w0[3], offset); w2[3] = amd_bytealign_S (w0[1], w0[2], offset); w2[2] = amd_bytealign_S (w0[0], w0[1], offset); w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w3[3] = amd_bytealign_S (w1[0], w1[1], 
offset); w3[2] = amd_bytealign_S (w0[3], w1[0], offset); w3[1] = amd_bytealign_S (w0[2], w0[3], offset); w3[0] = amd_bytealign_S (w0[1], w0[2], offset); w2[3] = amd_bytealign_S (w0[0], w0[1], offset); w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w3[3] = amd_bytealign_S (w0[3], w1[0], offset); w3[2] = amd_bytealign_S (w0[2], w0[3], offset); w3[1] = amd_bytealign_S (w0[1], w0[2], offset); w3[0] = amd_bytealign_S (w0[0], w0[1], offset); w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w3[3] = amd_bytealign_S (w0[2], w0[3], offset); w3[2] = amd_bytealign_S (w0[1], w0[2], offset); w3[1] = amd_bytealign_S (w0[0], w0[1], offset); w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w3[3] = amd_bytealign_S (w0[1], w0[2], offset); w3[2] = amd_bytealign_S (w0[0], w0[1], offset); w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w3[3] = amd_bytealign_S (w0[0], w0[1], offset); w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); w0[3] = swap32_S (w0[3]); w1[0] = swap32_S (w1[0]); 
w1[1] = swap32_S (w1[1]); w1[2] = swap32_S (w1[2]); w1[3] = swap32_S (w1[3]); w2[0] = swap32_S (w2[0]); w2[1] = swap32_S (w2[1]); w2[2] = swap32_S (w2[2]); w2[3] = swap32_S (w2[3]); w3[0] = swap32_S (w3[0]); w3[1] = swap32_S (w3[1]); w3[2] = swap32_S (w3[2]); w3[3] = swap32_S (w3[3]); #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif switch (offset_switch) { case 0: w3[3] = __byte_perm_S (w3[2], w3[3], selector); w3[2] = __byte_perm_S (w3[1], w3[2], selector); w3[1] = __byte_perm_S (w3[0], w3[1], selector); w3[0] = __byte_perm_S (w2[3], w3[0], selector); w2[3] = __byte_perm_S (w2[2], w2[3], selector); w2[2] = __byte_perm_S (w2[1], w2[2], selector); w2[1] = __byte_perm_S (w2[0], w2[1], selector); w2[0] = __byte_perm_S (w1[3], w2[0], selector); w1[3] = __byte_perm_S (w1[2], w1[3], selector); w1[2] = __byte_perm_S (w1[1], w1[2], selector); w1[1] = __byte_perm_S (w1[0], w1[1], selector); w1[0] = __byte_perm_S (w0[3], w1[0], selector); w0[3] = __byte_perm_S (w0[2], w0[3], selector); w0[2] = __byte_perm_S (w0[1], w0[2], selector); w0[1] = __byte_perm_S (w0[0], w0[1], selector); w0[0] = __byte_perm_S ( 0, w0[0], selector); break; case 1: w3[3] = __byte_perm_S (w3[1], w3[2], selector); w3[2] = __byte_perm_S (w3[0], w3[1], selector); w3[1] = __byte_perm_S (w2[3], w3[0], selector); w3[0] = __byte_perm_S (w2[2], w2[3], selector); w2[3] = __byte_perm_S (w2[1], w2[2], selector); w2[2] = __byte_perm_S (w2[0], w2[1], selector); w2[1] = __byte_perm_S (w1[3], w2[0], selector); w2[0] = __byte_perm_S (w1[2], w1[3], selector); w1[3] = __byte_perm_S (w1[1], w1[2], selector); w1[2] = __byte_perm_S (w1[0], w1[1], selector); w1[1] = __byte_perm_S (w0[3], w1[0], selector); w1[0] = __byte_perm_S (w0[2], w0[3], selector); w0[3] = __byte_perm_S (w0[1], w0[2], selector); w0[2] = __byte_perm_S (w0[0], 
w0[1], selector); w0[1] = __byte_perm_S ( 0, w0[0], selector); w0[0] = 0; break; case 2: w3[3] = __byte_perm_S (w3[0], w3[1], selector); w3[2] = __byte_perm_S (w2[3], w3[0], selector); w3[1] = __byte_perm_S (w2[2], w2[3], selector); w3[0] = __byte_perm_S (w2[1], w2[2], selector); w2[3] = __byte_perm_S (w2[0], w2[1], selector); w2[2] = __byte_perm_S (w1[3], w2[0], selector); w2[1] = __byte_perm_S (w1[2], w1[3], selector); w2[0] = __byte_perm_S (w1[1], w1[2], selector); w1[3] = __byte_perm_S (w1[0], w1[1], selector); w1[2] = __byte_perm_S (w0[3], w1[0], selector); w1[1] = __byte_perm_S (w0[2], w0[3], selector); w1[0] = __byte_perm_S (w0[1], w0[2], selector); w0[3] = __byte_perm_S (w0[0], w0[1], selector); w0[2] = __byte_perm_S ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: w3[3] = __byte_perm_S (w2[3], w3[0], selector); w3[2] = __byte_perm_S (w2[2], w2[3], selector); w3[1] = __byte_perm_S (w2[1], w2[2], selector); w3[0] = __byte_perm_S (w2[0], w2[1], selector); w2[3] = __byte_perm_S (w1[3], w2[0], selector); w2[2] = __byte_perm_S (w1[2], w1[3], selector); w2[1] = __byte_perm_S (w1[1], w1[2], selector); w2[0] = __byte_perm_S (w1[0], w1[1], selector); w1[3] = __byte_perm_S (w0[3], w1[0], selector); w1[2] = __byte_perm_S (w0[2], w0[3], selector); w1[1] = __byte_perm_S (w0[1], w0[2], selector); w1[0] = __byte_perm_S (w0[0], w0[1], selector); w0[3] = __byte_perm_S ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w3[3] = __byte_perm_S (w2[2], w2[3], selector); w3[2] = __byte_perm_S (w2[1], w2[2], selector); w3[1] = __byte_perm_S (w2[0], w2[1], selector); w3[0] = __byte_perm_S (w1[3], w2[0], selector); w2[3] = __byte_perm_S (w1[2], w1[3], selector); w2[2] = __byte_perm_S (w1[1], w1[2], selector); w2[1] = __byte_perm_S (w1[0], w1[1], selector); w2[0] = __byte_perm_S (w0[3], w1[0], selector); w1[3] = __byte_perm_S (w0[2], w0[3], selector); w1[2] = __byte_perm_S (w0[1], w0[2], selector); w1[1] = __byte_perm_S (w0[0], w0[1], selector); w1[0] = 
__byte_perm_S ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w3[3] = __byte_perm_S (w2[1], w2[2], selector); w3[2] = __byte_perm_S (w2[0], w2[1], selector); w3[1] = __byte_perm_S (w1[3], w2[0], selector); w3[0] = __byte_perm_S (w1[2], w1[3], selector); w2[3] = __byte_perm_S (w1[1], w1[2], selector); w2[2] = __byte_perm_S (w1[0], w1[1], selector); w2[1] = __byte_perm_S (w0[3], w1[0], selector); w2[0] = __byte_perm_S (w0[2], w0[3], selector); w1[3] = __byte_perm_S (w0[1], w0[2], selector); w1[2] = __byte_perm_S (w0[0], w0[1], selector); w1[1] = __byte_perm_S ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w3[3] = __byte_perm_S (w2[0], w2[1], selector); w3[2] = __byte_perm_S (w1[3], w2[0], selector); w3[1] = __byte_perm_S (w1[2], w1[3], selector); w3[0] = __byte_perm_S (w1[1], w1[2], selector); w2[3] = __byte_perm_S (w1[0], w1[1], selector); w2[2] = __byte_perm_S (w0[3], w1[0], selector); w2[1] = __byte_perm_S (w0[2], w0[3], selector); w2[0] = __byte_perm_S (w0[1], w0[2], selector); w1[3] = __byte_perm_S (w0[0], w0[1], selector); w1[2] = __byte_perm_S ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w3[3] = __byte_perm_S (w1[3], w2[0], selector); w3[2] = __byte_perm_S (w1[2], w1[3], selector); w3[1] = __byte_perm_S (w1[1], w1[2], selector); w3[0] = __byte_perm_S (w1[0], w1[1], selector); w2[3] = __byte_perm_S (w0[3], w1[0], selector); w2[2] = __byte_perm_S (w0[2], w0[3], selector); w2[1] = __byte_perm_S (w0[1], w0[2], selector); w2[0] = __byte_perm_S (w0[0], w0[1], selector); w1[3] = __byte_perm_S ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w3[3] = __byte_perm_S (w1[2], w1[3], selector); w3[2] = __byte_perm_S (w1[1], w1[2], selector); w3[1] = __byte_perm_S (w1[0], w1[1], selector); w3[0] = __byte_perm_S (w0[3], w1[0], selector); w2[3] = __byte_perm_S (w0[2], w0[3], 
selector); w2[2] = __byte_perm_S (w0[1], w0[2], selector); w2[1] = __byte_perm_S (w0[0], w0[1], selector); w2[0] = __byte_perm_S ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w3[3] = __byte_perm_S (w1[1], w1[2], selector); w3[2] = __byte_perm_S (w1[0], w1[1], selector); w3[1] = __byte_perm_S (w0[3], w1[0], selector); w3[0] = __byte_perm_S (w0[2], w0[3], selector); w2[3] = __byte_perm_S (w0[1], w0[2], selector); w2[2] = __byte_perm_S (w0[0], w0[1], selector); w2[1] = __byte_perm_S ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w3[3] = __byte_perm_S (w1[0], w1[1], selector); w3[2] = __byte_perm_S (w0[3], w1[0], selector); w3[1] = __byte_perm_S (w0[2], w0[3], selector); w3[0] = __byte_perm_S (w0[1], w0[2], selector); w2[3] = __byte_perm_S (w0[0], w0[1], selector); w2[2] = __byte_perm_S ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w3[3] = __byte_perm_S (w0[3], w1[0], selector); w3[2] = __byte_perm_S (w0[2], w0[3], selector); w3[1] = __byte_perm_S (w0[1], w0[2], selector); w3[0] = __byte_perm_S (w0[0], w0[1], selector); w2[3] = __byte_perm_S ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w3[3] = __byte_perm_S (w0[2], w0[3], selector); w3[2] = __byte_perm_S (w0[1], w0[2], selector); w3[1] = __byte_perm_S (w0[0], w0[1], selector); w3[0] = __byte_perm_S ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w3[3] = __byte_perm_S (w0[1], w0[2], selector); w3[2] = __byte_perm_S (w0[0], w0[1], selector); w3[1] = __byte_perm_S ( 0, w0[0], selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; 
w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w3[3] = __byte_perm_S (w0[0], w0[1], selector); w3[2] = __byte_perm_S ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w3[3] = __byte_perm_S ( 0, w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_carry_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; const int offset_switch = offset / 4; #if defined IS_AMD || defined IS_GENERIC w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); w0[3] = swap32_S (w0[3]); w1[0] = swap32_S (w1[0]); w1[1] = swap32_S (w1[1]); w1[2] = swap32_S (w1[2]); w1[3] = swap32_S (w1[3]); w2[0] = swap32_S (w2[0]); w2[1] = swap32_S (w2[1]); w2[2] = swap32_S (w2[2]); w2[3] = swap32_S (w2[3]); w3[0] = swap32_S (w3[0]); w3[1] = swap32_S (w3[1]); w3[2] = swap32_S (w3[2]); w3[3] = swap32_S (w3[3]); switch (offset_switch) { case 0: c0[0] = amd_bytealign_S (w3[3], 0, offset); w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); w3[1] = amd_bytealign_S (w3[0], w3[1], offset); w3[0] = amd_bytealign_S (w2[3], w3[0], offset); w2[3] = amd_bytealign_S (w2[2], w2[3], offset); w2[2] = amd_bytealign_S (w2[1], w2[2], offset); w2[1] = amd_bytealign_S (w2[0], w2[1], offset); w2[0] = amd_bytealign_S (w1[3], w2[0], offset); w1[3] = amd_bytealign_S (w1[2], w1[3], offset); w1[2] = amd_bytealign_S (w1[1], w1[2], offset); w1[1] = amd_bytealign_S (w1[0], w1[1], offset); w1[0] = amd_bytealign_S (w0[3], w1[0], 
offset); w0[3] = amd_bytealign_S (w0[2], w0[3], offset); w0[2] = amd_bytealign_S (w0[1], w0[2], offset); w0[1] = amd_bytealign_S (w0[0], w0[1], offset); w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: c0[1] = amd_bytealign_S (w3[3], 0, offset); c0[0] = amd_bytealign_S (w3[2], w3[3], offset); w3[3] = amd_bytealign_S (w3[1], w3[2], offset); w3[2] = amd_bytealign_S (w3[0], w3[1], offset); w3[1] = amd_bytealign_S (w2[3], w3[0], offset); w3[0] = amd_bytealign_S (w2[2], w2[3], offset); w2[3] = amd_bytealign_S (w2[1], w2[2], offset); w2[2] = amd_bytealign_S (w2[0], w2[1], offset); w2[1] = amd_bytealign_S (w1[3], w2[0], offset); w2[0] = amd_bytealign_S (w1[2], w1[3], offset); w1[3] = amd_bytealign_S (w1[1], w1[2], offset); w1[2] = amd_bytealign_S (w1[0], w1[1], offset); w1[1] = amd_bytealign_S (w0[3], w1[0], offset); w1[0] = amd_bytealign_S (w0[2], w0[3], offset); w0[3] = amd_bytealign_S (w0[1], w0[2], offset); w0[2] = amd_bytealign_S (w0[0], w0[1], offset); w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: c0[2] = amd_bytealign_S (w3[3], 0, offset); c0[1] = amd_bytealign_S (w3[2], w3[3], offset); c0[0] = amd_bytealign_S (w3[1], w3[2], offset); w3[3] = amd_bytealign_S (w3[0], w3[1], offset); w3[2] = amd_bytealign_S (w2[3], w3[0], offset); w3[1] = amd_bytealign_S (w2[2], w2[3], offset); w3[0] = amd_bytealign_S (w2[1], w2[2], offset); w2[3] = amd_bytealign_S (w2[0], w2[1], offset); w2[2] = amd_bytealign_S (w1[3], w2[0], offset); w2[1] = amd_bytealign_S (w1[2], w1[3], offset); w2[0] = amd_bytealign_S (w1[1], w1[2], offset); w1[3] = amd_bytealign_S (w1[0], w1[1], offset); w1[2] = amd_bytealign_S (w0[3], w1[0], offset); w1[1] = amd_bytealign_S (w0[2], w0[3], offset); w1[0] = amd_bytealign_S (w0[1], w0[2], offset); w0[3] = amd_bytealign_S (w0[0], w0[1], offset); w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = amd_bytealign_S (w3[3], 0, offset); c0[2] = amd_bytealign_S (w3[2], w3[3], offset); c0[1] = 
amd_bytealign_S (w3[1], w3[2], offset); c0[0] = amd_bytealign_S (w3[0], w3[1], offset); w3[3] = amd_bytealign_S (w2[3], w3[0], offset); w3[2] = amd_bytealign_S (w2[2], w2[3], offset); w3[1] = amd_bytealign_S (w2[1], w2[2], offset); w3[0] = amd_bytealign_S (w2[0], w2[1], offset); w2[3] = amd_bytealign_S (w1[3], w2[0], offset); w2[2] = amd_bytealign_S (w1[2], w1[3], offset); w2[1] = amd_bytealign_S (w1[1], w1[2], offset); w2[0] = amd_bytealign_S (w1[0], w1[1], offset); w1[3] = amd_bytealign_S (w0[3], w1[0], offset); w1[2] = amd_bytealign_S (w0[2], w0[3], offset); w1[1] = amd_bytealign_S (w0[1], w0[2], offset); w1[0] = amd_bytealign_S (w0[0], w0[1], offset); w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = amd_bytealign_S (w3[3], 0, offset); c0[3] = amd_bytealign_S (w3[2], w3[3], offset); c0[2] = amd_bytealign_S (w3[1], w3[2], offset); c0[1] = amd_bytealign_S (w3[0], w3[1], offset); c0[0] = amd_bytealign_S (w2[3], w3[0], offset); w3[3] = amd_bytealign_S (w2[2], w2[3], offset); w3[2] = amd_bytealign_S (w2[1], w2[2], offset); w3[1] = amd_bytealign_S (w2[0], w2[1], offset); w3[0] = amd_bytealign_S (w1[3], w2[0], offset); w2[3] = amd_bytealign_S (w1[2], w1[3], offset); w2[2] = amd_bytealign_S (w1[1], w1[2], offset); w2[1] = amd_bytealign_S (w1[0], w1[1], offset); w2[0] = amd_bytealign_S (w0[3], w1[0], offset); w1[3] = amd_bytealign_S (w0[2], w0[3], offset); w1[2] = amd_bytealign_S (w0[1], w0[2], offset); w1[1] = amd_bytealign_S (w0[0], w0[1], offset); w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = amd_bytealign_S (w3[3], 0, offset); c1[0] = amd_bytealign_S (w3[2], w3[3], offset); c0[3] = amd_bytealign_S (w3[1], w3[2], offset); c0[2] = amd_bytealign_S (w3[0], w3[1], offset); c0[1] = amd_bytealign_S (w2[3], w3[0], offset); c0[0] = amd_bytealign_S (w2[2], w2[3], offset); w3[3] = amd_bytealign_S (w2[1], w2[2], offset); w3[2] = amd_bytealign_S (w2[0], w2[1], 
offset); w3[1] = amd_bytealign_S (w1[3], w2[0], offset); w3[0] = amd_bytealign_S (w1[2], w1[3], offset); w2[3] = amd_bytealign_S (w1[1], w1[2], offset); w2[2] = amd_bytealign_S (w1[0], w1[1], offset); w2[1] = amd_bytealign_S (w0[3], w1[0], offset); w2[0] = amd_bytealign_S (w0[2], w0[3], offset); w1[3] = amd_bytealign_S (w0[1], w0[2], offset); w1[2] = amd_bytealign_S (w0[0], w0[1], offset); w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = amd_bytealign_S (w3[3], 0, offset); c1[1] = amd_bytealign_S (w3[2], w3[3], offset); c1[0] = amd_bytealign_S (w3[1], w3[2], offset); c0[3] = amd_bytealign_S (w3[0], w3[1], offset); c0[2] = amd_bytealign_S (w2[3], w3[0], offset); c0[1] = amd_bytealign_S (w2[2], w2[3], offset); c0[0] = amd_bytealign_S (w2[1], w2[2], offset); w3[3] = amd_bytealign_S (w2[0], w2[1], offset); w3[2] = amd_bytealign_S (w1[3], w2[0], offset); w3[1] = amd_bytealign_S (w1[2], w1[3], offset); w3[0] = amd_bytealign_S (w1[1], w1[2], offset); w2[3] = amd_bytealign_S (w1[0], w1[1], offset); w2[2] = amd_bytealign_S (w0[3], w1[0], offset); w2[1] = amd_bytealign_S (w0[2], w0[3], offset); w2[0] = amd_bytealign_S (w0[1], w0[2], offset); w1[3] = amd_bytealign_S (w0[0], w0[1], offset); w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = amd_bytealign_S (w3[3], 0, offset); c1[2] = amd_bytealign_S (w3[2], w3[3], offset); c1[1] = amd_bytealign_S (w3[1], w3[2], offset); c1[0] = amd_bytealign_S (w3[0], w3[1], offset); c0[3] = amd_bytealign_S (w2[3], w3[0], offset); c0[2] = amd_bytealign_S (w2[2], w2[3], offset); c0[1] = amd_bytealign_S (w2[1], w2[2], offset); c0[0] = amd_bytealign_S (w2[0], w2[1], offset); w3[3] = amd_bytealign_S (w1[3], w2[0], offset); w3[2] = amd_bytealign_S (w1[2], w1[3], offset); w3[1] = amd_bytealign_S (w1[1], w1[2], offset); w3[0] = amd_bytealign_S (w1[0], w1[1], offset); w2[3] = amd_bytealign_S 
(w0[3], w1[0], offset); w2[2] = amd_bytealign_S (w0[2], w0[3], offset); w2[1] = amd_bytealign_S (w0[1], w0[2], offset); w2[0] = amd_bytealign_S (w0[0], w0[1], offset); w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = amd_bytealign_S (w3[3], 0, offset); c1[3] = amd_bytealign_S (w3[2], w3[3], offset); c1[2] = amd_bytealign_S (w3[1], w3[2], offset); c1[1] = amd_bytealign_S (w3[0], w3[1], offset); c1[0] = amd_bytealign_S (w2[3], w3[0], offset); c0[3] = amd_bytealign_S (w2[2], w2[3], offset); c0[2] = amd_bytealign_S (w2[1], w2[2], offset); c0[1] = amd_bytealign_S (w2[0], w2[1], offset); c0[0] = amd_bytealign_S (w1[3], w2[0], offset); w3[3] = amd_bytealign_S (w1[2], w1[3], offset); w3[2] = amd_bytealign_S (w1[1], w1[2], offset); w3[1] = amd_bytealign_S (w1[0], w1[1], offset); w3[0] = amd_bytealign_S (w0[3], w1[0], offset); w2[3] = amd_bytealign_S (w0[2], w0[3], offset); w2[2] = amd_bytealign_S (w0[1], w0[2], offset); w2[1] = amd_bytealign_S (w0[0], w0[1], offset); w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = amd_bytealign_S (w3[3], 0, offset); c2[0] = amd_bytealign_S (w3[2], w3[3], offset); c1[3] = amd_bytealign_S (w3[1], w3[2], offset); c1[2] = amd_bytealign_S (w3[0], w3[1], offset); c1[1] = amd_bytealign_S (w2[3], w3[0], offset); c1[0] = amd_bytealign_S (w2[2], w2[3], offset); c0[3] = amd_bytealign_S (w2[1], w2[2], offset); c0[2] = amd_bytealign_S (w2[0], w2[1], offset); c0[1] = amd_bytealign_S (w1[3], w2[0], offset); c0[0] = amd_bytealign_S (w1[2], w1[3], offset); w3[3] = amd_bytealign_S (w1[1], w1[2], offset); w3[2] = amd_bytealign_S (w1[0], w1[1], offset); w3[1] = amd_bytealign_S (w0[3], w1[0], offset); w3[0] = amd_bytealign_S (w0[2], w0[3], offset); w2[3] = amd_bytealign_S (w0[1], w0[2], offset); w2[2] = amd_bytealign_S (w0[0], w0[1], offset); w2[1] = 
amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = amd_bytealign_S (w3[3], 0, offset); c2[1] = amd_bytealign_S (w3[2], w3[3], offset); c2[0] = amd_bytealign_S (w3[1], w3[2], offset); c1[3] = amd_bytealign_S (w3[0], w3[1], offset); c1[2] = amd_bytealign_S (w2[3], w3[0], offset); c1[1] = amd_bytealign_S (w2[2], w2[3], offset); c1[0] = amd_bytealign_S (w2[1], w2[2], offset); c0[3] = amd_bytealign_S (w2[0], w2[1], offset); c0[2] = amd_bytealign_S (w1[3], w2[0], offset); c0[1] = amd_bytealign_S (w1[2], w1[3], offset); c0[0] = amd_bytealign_S (w1[1], w1[2], offset); w3[3] = amd_bytealign_S (w1[0], w1[1], offset); w3[2] = amd_bytealign_S (w0[3], w1[0], offset); w3[1] = amd_bytealign_S (w0[2], w0[3], offset); w3[0] = amd_bytealign_S (w0[1], w0[2], offset); w2[3] = amd_bytealign_S (w0[0], w0[1], offset); w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = amd_bytealign_S (w3[3], 0, offset); c2[2] = amd_bytealign_S (w3[2], w3[3], offset); c2[1] = amd_bytealign_S (w3[1], w3[2], offset); c2[0] = amd_bytealign_S (w3[0], w3[1], offset); c1[3] = amd_bytealign_S (w2[3], w3[0], offset); c1[2] = amd_bytealign_S (w2[2], w2[3], offset); c1[1] = amd_bytealign_S (w2[1], w2[2], offset); c1[0] = amd_bytealign_S (w2[0], w2[1], offset); c0[3] = amd_bytealign_S (w1[3], w2[0], offset); c0[2] = amd_bytealign_S (w1[2], w1[3], offset); c0[1] = amd_bytealign_S (w1[1], w1[2], offset); c0[0] = amd_bytealign_S (w1[0], w1[1], offset); w3[3] = amd_bytealign_S (w0[3], w1[0], offset); w3[2] = amd_bytealign_S (w0[2], w0[3], offset); w3[1] = amd_bytealign_S (w0[1], w0[2], offset); w3[0] = amd_bytealign_S (w0[0], w0[1], offset); w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; 
w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = amd_bytealign_S (w3[3], 0, offset); c2[3] = amd_bytealign_S (w3[2], w3[3], offset); c2[2] = amd_bytealign_S (w3[1], w3[2], offset); c2[1] = amd_bytealign_S (w3[0], w3[1], offset); c2[0] = amd_bytealign_S (w2[3], w3[0], offset); c1[3] = amd_bytealign_S (w2[2], w2[3], offset); c1[2] = amd_bytealign_S (w2[1], w2[2], offset); c1[1] = amd_bytealign_S (w2[0], w2[1], offset); c1[0] = amd_bytealign_S (w1[3], w2[0], offset); c0[3] = amd_bytealign_S (w1[2], w1[3], offset); c0[2] = amd_bytealign_S (w1[1], w1[2], offset); c0[1] = amd_bytealign_S (w1[0], w1[1], offset); c0[0] = amd_bytealign_S (w0[3], w1[0], offset); w3[3] = amd_bytealign_S (w0[2], w0[3], offset); w3[2] = amd_bytealign_S (w0[1], w0[2], offset); w3[1] = amd_bytealign_S (w0[0], w0[1], offset); w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = amd_bytealign_S (w3[3], 0, offset); c3[0] = amd_bytealign_S (w3[2], w3[3], offset); c2[3] = amd_bytealign_S (w3[1], w3[2], offset); c2[2] = amd_bytealign_S (w3[0], w3[1], offset); c2[1] = amd_bytealign_S (w2[3], w3[0], offset); c2[0] = amd_bytealign_S (w2[2], w2[3], offset); c1[3] = amd_bytealign_S (w2[1], w2[2], offset); c1[2] = amd_bytealign_S (w2[0], w2[1], offset); c1[1] = amd_bytealign_S (w1[3], w2[0], offset); c1[0] = amd_bytealign_S (w1[2], w1[3], offset); c0[3] = amd_bytealign_S (w1[1], w1[2], offset); c0[2] = amd_bytealign_S (w1[0], w1[1], offset); c0[1] = amd_bytealign_S (w0[3], w1[0], offset); c0[0] = amd_bytealign_S (w0[2], w0[3], offset); w3[3] = amd_bytealign_S (w0[1], w0[2], offset); w3[2] = amd_bytealign_S (w0[0], w0[1], offset); w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = amd_bytealign_S (w3[3], 0, 
offset); c3[1] = amd_bytealign_S (w3[2], w3[3], offset); c3[0] = amd_bytealign_S (w3[1], w3[2], offset); c2[3] = amd_bytealign_S (w3[0], w3[1], offset); c2[2] = amd_bytealign_S (w2[3], w3[0], offset); c2[1] = amd_bytealign_S (w2[2], w2[3], offset); c2[0] = amd_bytealign_S (w2[1], w2[2], offset); c1[3] = amd_bytealign_S (w2[0], w2[1], offset); c1[2] = amd_bytealign_S (w1[3], w2[0], offset); c1[1] = amd_bytealign_S (w1[2], w1[3], offset); c1[0] = amd_bytealign_S (w1[1], w1[2], offset); c0[3] = amd_bytealign_S (w1[0], w1[1], offset); c0[2] = amd_bytealign_S (w0[3], w1[0], offset); c0[1] = amd_bytealign_S (w0[2], w0[3], offset); c0[0] = amd_bytealign_S (w0[1], w0[2], offset); w3[3] = amd_bytealign_S (w0[0], w0[1], offset); w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = amd_bytealign_S (w3[3], 0, offset); c3[2] = amd_bytealign_S (w3[2], w3[3], offset); c3[1] = amd_bytealign_S (w3[1], w3[2], offset); c3[0] = amd_bytealign_S (w3[0], w3[1], offset); c2[3] = amd_bytealign_S (w2[3], w3[0], offset); c2[2] = amd_bytealign_S (w2[2], w2[3], offset); c2[1] = amd_bytealign_S (w2[1], w2[2], offset); c2[0] = amd_bytealign_S (w2[0], w2[1], offset); c1[3] = amd_bytealign_S (w1[3], w2[0], offset); c1[2] = amd_bytealign_S (w1[2], w1[3], offset); c1[1] = amd_bytealign_S (w1[1], w1[2], offset); c1[0] = amd_bytealign_S (w1[0], w1[1], offset); c0[3] = amd_bytealign_S (w0[3], w1[0], offset); c0[2] = amd_bytealign_S (w0[2], w0[3], offset); c0[1] = amd_bytealign_S (w0[1], w0[2], offset); c0[0] = amd_bytealign_S (w0[0], w0[1], offset); w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S 
(w0[2]); w0[3] = swap32_S (w0[3]); w1[0] = swap32_S (w1[0]); w1[1] = swap32_S (w1[1]); w1[2] = swap32_S (w1[2]); w1[3] = swap32_S (w1[3]); w2[0] = swap32_S (w2[0]); w2[1] = swap32_S (w2[1]); w2[2] = swap32_S (w2[2]); w2[3] = swap32_S (w2[3]); w3[0] = swap32_S (w3[0]); w3[1] = swap32_S (w3[1]); w3[2] = swap32_S (w3[2]); w3[3] = swap32_S (w3[3]); c0[0] = swap32_S (c0[0]); c0[1] = swap32_S (c0[1]); c0[2] = swap32_S (c0[2]); c0[3] = swap32_S (c0[3]); c1[0] = swap32_S (c1[0]); c1[1] = swap32_S (c1[1]); c1[2] = swap32_S (c1[2]); c1[3] = swap32_S (c1[3]); c2[0] = swap32_S (c2[0]); c2[1] = swap32_S (c2[1]); c2[2] = swap32_S (c2[2]); c2[3] = swap32_S (c2[3]); c3[0] = swap32_S (c3[0]); c3[1] = swap32_S (c3[1]); c3[2] = swap32_S (c3[2]); c3[3] = swap32_S (c3[3]); #endif #ifdef IS_NV // todo switch (offset_switch) { case 0: c0[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); w3[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); w3[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); w3[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); w3[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); w2[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); w2[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); w2[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w2[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w1[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w1[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w1[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w1[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w0[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w0[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w0[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w0[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); if (offset_mod_4 == 0) { w0[0] = w0[1]; w0[1] = w0[2]; w0[2] = w0[3]; w0[3] = w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = 
w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = 0; } break; case 1: c0[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c0[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); w3[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); w3[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); w3[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); w3[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); w2[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); w2[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w2[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w2[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w1[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w1[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w1[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w1[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w0[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w0[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w0[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[0] = 0; if (offset_mod_4 == 0) { w0[1] = w0[2]; w0[2] = w0[3]; w0[3] = w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = 0; } break; case 2: c0[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c0[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c0[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); w3[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); w3[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); w3[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); w3[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); w2[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w2[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w2[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w2[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w1[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w1[2] = 
amd_bytealign_S (w1[0], w0[3], offset_minus_4); w1[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w1[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w0[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w0[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w0[2] = w0[3]; w0[3] = w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = 0; } break; case 3: c0[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c0[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c0[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c0[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); w3[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); w3[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); w3[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); w3[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w2[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w2[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w2[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w2[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w1[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w1[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w1[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w1[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w0[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w0[3] = w1[0]; w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = 0; } break; case 4: c1[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c0[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c0[2] = amd_bytealign_S (w3[2], 
w3[1], offset_minus_4); c0[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c0[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); w3[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); w3[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); w3[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w3[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w2[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w2[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w2[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w2[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w1[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w1[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w1[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w1[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[0] = w1[1]; w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = 0; } break; case 5: c1[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c1[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c0[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c0[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c0[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c0[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); w3[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); w3[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w3[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w3[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w2[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w2[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w2[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w2[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w1[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w1[2] = 
amd_bytealign_S (w0[1], w0[0], offset_minus_4); w1[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[1] = w1[2]; w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = 0; } break; case 6: c1[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c1[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c1[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c0[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c0[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c0[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c0[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); w3[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w3[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w3[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w3[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w2[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w2[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w2[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w2[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w1[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w1[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[2] = w1[3]; w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = 0; } break; case 7: c1[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c1[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c1[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c1[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c0[3] = amd_bytealign_S 
(w3[0], w2[3], offset_minus_4); c0[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c0[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c0[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); w3[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w3[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w3[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w3[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w2[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w2[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w2[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w2[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w1[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w1[3] = w2[0]; w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = 0; } break; case 8: c2[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c1[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c1[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c1[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c1[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c0[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c0[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c0[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c0[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); w3[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w3[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w3[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w3[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w2[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w2[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w2[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w2[0] = amd_bytealign_S (w0[0], 0, 
offset_minus_4); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[0] = w2[1]; w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = 0; } break; case 9: c2[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c2[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c1[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c1[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c1[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c1[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c0[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c0[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c0[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); c0[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); w3[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w3[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w3[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w3[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w2[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w2[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w2[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[1] = w2[2]; w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = 0; } break; case 10: c2[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c2[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c2[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c1[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c1[2] = amd_bytealign_S (w3[0], w2[3], 
offset_minus_4); c1[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c1[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c0[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c0[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); c0[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); c0[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); w3[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w3[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w3[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w3[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w2[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w2[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[2] = w2[3]; w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = 0; } break; case 11: c2[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c2[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c2[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c2[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c1[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c1[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c1[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c1[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c0[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); c0[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); c0[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); c0[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); w3[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w3[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w3[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w3[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w2[3] = 
amd_bytealign_S (w0[0], 0, offset_minus_4); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w2[3] = w3[0]; w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = 0; } break; case 12: c3[0] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c2[3] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c2[2] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c2[1] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c2[0] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c1[3] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c1[2] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c1[1] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c1[0] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); c0[3] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); c0[2] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); c0[1] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); c0[0] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); w3[3] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w3[2] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w3[1] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w3[0] = amd_bytealign_S (w0[0], 0, offset_minus_4); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[0] = w3[1]; w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = 0; } break; case 13: c3[1] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c3[0] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c2[3] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); 
c2[2] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c2[1] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c2[0] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c1[3] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c1[2] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c1[1] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); c1[0] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); c0[3] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); c0[2] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); c0[1] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); c0[0] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); w3[3] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w3[2] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w3[1] = amd_bytealign_S (w0[0], 0, offset_minus_4); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[1] = w3[2]; w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = c3[1]; c3[1] = 0; } break; case 14: c3[2] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c3[1] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c3[0] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c2[3] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c2[2] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c2[1] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c2[0] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c1[3] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c1[2] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); c1[1] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); c1[0] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); c0[3] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); c0[2] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); c0[1] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); 
c0[0] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); w3[3] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w3[2] = amd_bytealign_S (w0[0], 0, offset_minus_4); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[2] = w3[3]; w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = c3[1]; c3[1] = c3[2]; c3[2] = 0; } break; case 15: c3[3] = amd_bytealign_S ( 0, w3[3], offset_minus_4); c3[2] = amd_bytealign_S (w3[3], w3[2], offset_minus_4); c3[1] = amd_bytealign_S (w3[2], w3[1], offset_minus_4); c3[0] = amd_bytealign_S (w3[1], w3[0], offset_minus_4); c2[3] = amd_bytealign_S (w3[0], w2[3], offset_minus_4); c2[2] = amd_bytealign_S (w2[3], w2[2], offset_minus_4); c2[1] = amd_bytealign_S (w2[2], w2[1], offset_minus_4); c2[0] = amd_bytealign_S (w2[1], w2[0], offset_minus_4); c1[3] = amd_bytealign_S (w2[0], w1[3], offset_minus_4); c1[2] = amd_bytealign_S (w1[3], w1[2], offset_minus_4); c1[1] = amd_bytealign_S (w1[2], w1[1], offset_minus_4); c1[0] = amd_bytealign_S (w1[1], w1[0], offset_minus_4); c0[3] = amd_bytealign_S (w1[0], w0[3], offset_minus_4); c0[2] = amd_bytealign_S (w0[3], w0[2], offset_minus_4); c0[1] = amd_bytealign_S (w0[2], w0[1], offset_minus_4); c0[0] = amd_bytealign_S (w0[1], w0[0], offset_minus_4); w3[3] = amd_bytealign_S (w0[0], 0, offset_minus_4); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; if (offset_mod_4 == 0) { w3[3] = c0[0]; c0[0] = c0[1]; c0[1] = c0[2]; c0[2] = c0[3]; c0[3] = c1[0]; c1[0] = c1[1]; c1[1] = c1[2]; c1[2] = c1[3]; c1[3] = c2[0]; c2[0] = c2[1]; c2[1] = c2[2]; c2[2] = c2[3]; c2[3] = c3[0]; c3[0] = c3[1]; c3[1] = c3[2]; c3[2] = c3[3]; c3[3] = 0; } 
break; } #endif }

/**
 * Shifts, in place, the 16-word buffer made of w0[0..3], w1[0..3], w2[0..3]
 * and w3[0..3] (in that order) towards higher word positions.
 *
 * offset / 4 selects one of 16 switch cases and is the whole-word part of the
 * shift. In case k, every word from position k upwards is rebuilt from a pair
 * of adjacent source words lying k positions lower (the lowest of them is
 * paired with the constant 0), and every word below position k is set to 0.
 * The pair is combined by amd_bytealign_S or __byte_perm_S, depending on the
 * preprocessor branch. Words that would move past w3[3] are not stored
 * anywhere: this function has no carry parameters.
 *
 * Assignments run from w3[3] down to w0[0] and each right-hand side only
 * reads words at or below its destination, so every source word is read
 * before it is overwritten. Do not reorder the statements.
 *
 * Neither switch has a default label: when offset_switch >= 16 no case
 * matches and the buffers are left unchanged. If none of the preprocessor
 * conditions holds, the function only computes offset_switch.
 *
 * NOTE(review): offset / 4 being used as a word count suggests that offset is
 * a byte count and u32 is a 4-byte word; "_be" presumably means big-endian
 * byte packing and "_S" the scalar variant. None of this is established by
 * this part of the file - confirm against the type and helper definitions.
 *
 * w0, w1, w2, w3: the four 4-word quarters of the buffer, read and modified.
 * offset:         shift distance; offset / 4 picks the case, the full value is
 *                 passed to amd_bytealign_S, and offset & 3 picks the selector
 *                 used with __byte_perm_S.
 */
DECLSPEC void switch_buffer_by_offset_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], const u32 offset)
{
  /* whole-word part of the shift; used as the switch selector below */
  const int offset_switch = offset / 4;

  /*
   * Variant built on amd_bytealign_S (defined outside this chunk).
   * The call takes (lower-position word, higher-position word, offset).
   * NOTE(review): the full offset is passed, not offset & 3 - presumably the
   * helper only looks at the low bits of its third argument; verify.
   */
  #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC
  switch (offset_switch)
  {
    case 0:
      w3[3] = amd_bytealign_S (w3[2], w3[3], offset);
      w3[2] = amd_bytealign_S (w3[1], w3[2], offset);
      w3[1] = amd_bytealign_S (w3[0], w3[1], offset);
      w3[0] = amd_bytealign_S (w2[3], w3[0], offset);
      w2[3] = amd_bytealign_S (w2[2], w2[3], offset);
      w2[2] = amd_bytealign_S (w2[1], w2[2], offset);
      w2[1] = amd_bytealign_S (w2[0], w2[1], offset);
      w2[0] = amd_bytealign_S (w1[3], w2[0], offset);
      w1[3] = amd_bytealign_S (w1[2], w1[3], offset);
      w1[2] = amd_bytealign_S (w1[1], w1[2], offset);
      w1[1] = amd_bytealign_S (w1[0], w1[1], offset);
      w1[0] = amd_bytealign_S (w0[3], w1[0], offset);
      w0[3] = amd_bytealign_S (w0[2], w0[3], offset);
      w0[2] = amd_bytealign_S (w0[1], w0[2], offset);
      w0[1] = amd_bytealign_S (w0[0], w0[1], offset);
      w0[0] = amd_bytealign_S ( 0, w0[0], offset);
      break;

    case 1:
      w3[3] = amd_bytealign_S (w3[1], w3[2], offset);
      w3[2] = amd_bytealign_S (w3[0], w3[1], offset);
      w3[1] = amd_bytealign_S (w2[3], w3[0], offset);
      w3[0] = amd_bytealign_S (w2[2], w2[3], offset);
      w2[3] = amd_bytealign_S (w2[1], w2[2], offset);
      w2[2] = amd_bytealign_S (w2[0], w2[1], offset);
      w2[1] = amd_bytealign_S (w1[3], w2[0], offset);
      w2[0] = amd_bytealign_S (w1[2], w1[3], offset);
      w1[3] = amd_bytealign_S (w1[1], w1[2], offset);
      w1[2] = amd_bytealign_S (w1[0], w1[1], offset);
      w1[1] = amd_bytealign_S (w0[3], w1[0], offset);
      w1[0] = amd_bytealign_S (w0[2], w0[3], offset);
      w0[3] = amd_bytealign_S (w0[1], w0[2], offset);
      w0[2] = amd_bytealign_S (w0[0], w0[1], offset);
      w0[1] = amd_bytealign_S ( 0, w0[0], offset);
      w0[0] = 0;
      break;

    case 2:
      w3[3] = amd_bytealign_S (w3[0], w3[1], offset);
      w3[2] = amd_bytealign_S (w2[3], w3[0], offset);
      w3[1] = amd_bytealign_S (w2[2], w2[3], offset);
      w3[0] = amd_bytealign_S (w2[1], w2[2], offset);
      w2[3] = amd_bytealign_S (w2[0], w2[1], offset);
      w2[2] = amd_bytealign_S (w1[3], w2[0], offset);
      w2[1] = amd_bytealign_S (w1[2], w1[3], offset);
      w2[0] = amd_bytealign_S (w1[1], w1[2], offset);
      w1[3] = amd_bytealign_S (w1[0], w1[1], offset);
      w1[2] = amd_bytealign_S (w0[3], w1[0], offset);
      w1[1] = amd_bytealign_S (w0[2], w0[3], offset);
      w1[0] = amd_bytealign_S (w0[1], w0[2], offset);
      w0[3] = amd_bytealign_S (w0[0], w0[1], offset);
      w0[2] = amd_bytealign_S ( 0, w0[0], offset);
      w0[1] = 0; w0[0] = 0;
      break;

    case 3:
      w3[3] = amd_bytealign_S (w2[3], w3[0], offset);
      w3[2] = amd_bytealign_S (w2[2], w2[3], offset);
      w3[1] = amd_bytealign_S (w2[1], w2[2], offset);
      w3[0] = amd_bytealign_S (w2[0], w2[1], offset);
      w2[3] = amd_bytealign_S (w1[3], w2[0], offset);
      w2[2] = amd_bytealign_S (w1[2], w1[3], offset);
      w2[1] = amd_bytealign_S (w1[1], w1[2], offset);
      w2[0] = amd_bytealign_S (w1[0], w1[1], offset);
      w1[3] = amd_bytealign_S (w0[3], w1[0], offset);
      w1[2] = amd_bytealign_S (w0[2], w0[3], offset);
      w1[1] = amd_bytealign_S (w0[1], w0[2], offset);
      w1[0] = amd_bytealign_S (w0[0], w0[1], offset);
      w0[3] = amd_bytealign_S ( 0, w0[0], offset);
      w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 4:
      w3[3] = amd_bytealign_S (w2[2], w2[3], offset);
      w3[2] = amd_bytealign_S (w2[1], w2[2], offset);
      w3[1] = amd_bytealign_S (w2[0], w2[1], offset);
      w3[0] = amd_bytealign_S (w1[3], w2[0], offset);
      w2[3] = amd_bytealign_S (w1[2], w1[3], offset);
      w2[2] = amd_bytealign_S (w1[1], w1[2], offset);
      w2[1] = amd_bytealign_S (w1[0], w1[1], offset);
      w2[0] = amd_bytealign_S (w0[3], w1[0], offset);
      w1[3] = amd_bytealign_S (w0[2], w0[3], offset);
      w1[2] = amd_bytealign_S (w0[1], w0[2], offset);
      w1[1] = amd_bytealign_S (w0[0], w0[1], offset);
      w1[0] = amd_bytealign_S ( 0, w0[0], offset);
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 5:
      w3[3] = amd_bytealign_S (w2[1], w2[2], offset);
      w3[2] = amd_bytealign_S (w2[0], w2[1], offset);
      w3[1] = amd_bytealign_S (w1[3], w2[0], offset);
      w3[0] = amd_bytealign_S (w1[2], w1[3], offset);
      w2[3] = amd_bytealign_S (w1[1], w1[2], offset);
      w2[2] = amd_bytealign_S (w1[0], w1[1], offset);
      w2[1] = amd_bytealign_S (w0[3], w1[0], offset);
      w2[0] = amd_bytealign_S (w0[2], w0[3], offset);
      w1[3] = amd_bytealign_S (w0[1], w0[2], offset);
      w1[2] = amd_bytealign_S (w0[0], w0[1], offset);
      w1[1] = amd_bytealign_S ( 0, w0[0], offset);
      w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 6:
      w3[3] = amd_bytealign_S (w2[0], w2[1], offset);
      w3[2] = amd_bytealign_S (w1[3], w2[0], offset);
      w3[1] = amd_bytealign_S (w1[2], w1[3], offset);
      w3[0] = amd_bytealign_S (w1[1], w1[2], offset);
      w2[3] = amd_bytealign_S (w1[0], w1[1], offset);
      w2[2] = amd_bytealign_S (w0[3], w1[0], offset);
      w2[1] = amd_bytealign_S (w0[2], w0[3], offset);
      w2[0] = amd_bytealign_S (w0[1], w0[2], offset);
      w1[3] = amd_bytealign_S (w0[0], w0[1], offset);
      w1[2] = amd_bytealign_S ( 0, w0[0], offset);
      w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 7:
      w3[3] = amd_bytealign_S (w1[3], w2[0], offset);
      w3[2] = amd_bytealign_S (w1[2], w1[3], offset);
      w3[1] = amd_bytealign_S (w1[1], w1[2], offset);
      w3[0] = amd_bytealign_S (w1[0], w1[1], offset);
      w2[3] = amd_bytealign_S (w0[3], w1[0], offset);
      w2[2] = amd_bytealign_S (w0[2], w0[3], offset);
      w2[1] = amd_bytealign_S (w0[1], w0[2], offset);
      w2[0] = amd_bytealign_S (w0[0], w0[1], offset);
      w1[3] = amd_bytealign_S ( 0, w0[0], offset);
      w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 8:
      w3[3] = amd_bytealign_S (w1[2], w1[3], offset);
      w3[2] = amd_bytealign_S (w1[1], w1[2], offset);
      w3[1] = amd_bytealign_S (w1[0], w1[1], offset);
      w3[0] = amd_bytealign_S (w0[3], w1[0], offset);
      w2[3] = amd_bytealign_S (w0[2], w0[3], offset);
      w2[2] = amd_bytealign_S (w0[1], w0[2], offset);
      w2[1] = amd_bytealign_S (w0[0], w0[1], offset);
      w2[0] = amd_bytealign_S ( 0, w0[0], offset);
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 9:
      w3[3] = amd_bytealign_S (w1[1], w1[2], offset);
      w3[2] = amd_bytealign_S (w1[0], w1[1], offset);
      w3[1] = amd_bytealign_S (w0[3], w1[0], offset);
      w3[0] = amd_bytealign_S (w0[2], w0[3], offset);
      w2[3] = amd_bytealign_S (w0[1], w0[2], offset);
      w2[2] = amd_bytealign_S (w0[0], w0[1], offset);
      w2[1] = amd_bytealign_S ( 0, w0[0], offset);
      w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 10:
      w3[3] = amd_bytealign_S (w1[0], w1[1], offset);
      w3[2] = amd_bytealign_S (w0[3], w1[0], offset);
      w3[1] = amd_bytealign_S (w0[2], w0[3], offset);
      w3[0] = amd_bytealign_S (w0[1], w0[2], offset);
      w2[3] = amd_bytealign_S (w0[0], w0[1], offset);
      w2[2] = amd_bytealign_S ( 0, w0[0], offset);
      w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 11:
      w3[3] = amd_bytealign_S (w0[3], w1[0], offset);
      w3[2] = amd_bytealign_S (w0[2], w0[3], offset);
      w3[1] = amd_bytealign_S (w0[1], w0[2], offset);
      w3[0] = amd_bytealign_S (w0[0], w0[1], offset);
      w2[3] = amd_bytealign_S ( 0, w0[0], offset);
      w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 12:
      w3[3] = amd_bytealign_S (w0[2], w0[3], offset);
      w3[2] = amd_bytealign_S (w0[1], w0[2], offset);
      w3[1] = amd_bytealign_S (w0[0], w0[1], offset);
      w3[0] = amd_bytealign_S ( 0, w0[0], offset);
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 13:
      w3[3] = amd_bytealign_S (w0[1], w0[2], offset);
      w3[2] = amd_bytealign_S (w0[0], w0[1], offset);
      w3[1] = amd_bytealign_S ( 0, w0[0], offset);
      w3[0] = 0;
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 14:
      w3[3] = amd_bytealign_S (w0[0], w0[1], offset);
      w3[2] = amd_bytealign_S ( 0, w0[0], offset);
      w3[1] = 0; w3[0] = 0;
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 15:
      w3[3] = amd_bytealign_S ( 0, w0[0], offset);
      w3[2] = 0; w3[1] = 0; w3[0] = 0;
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;
  }
  #endif

  /*
   * Variant built on __byte_perm_S (defined outside this chunk).
   * The call takes (higher-position word, lower-position word, selector),
   * i.e. the word order is the reverse of the amd_bytealign_S variant above.
   */
  #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV

  /*
   * NV selector: 0x76543210 shifted right by one nibble per unit of
   * (offset & 3), keeping the low 16 bits: 0x3210, 0x4321, 0x5432, 0x6543.
   */
  #if defined IS_NV
  const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff;
  #endif

  /*
   * AMD selector: the literal needs 64 bits and is shifted right by one byte
   * per unit of (offset & 3); the result is narrowed when stored into the int.
   * NOTE(review): with a 32-bit int the kept low bits would be 0x03020100,
   * 0x04030201, 0x05040302, 0x06050403 - confirm this is what the helper
   * expects.
   */
  #if defined IS_AMD
  const int selector = 0x0706050403020100 >> ((offset & 3) * 8);
  #endif

  switch (offset_switch)
  {
    case 0:
      w3[3] = __byte_perm_S (w3[3], w3[2], selector);
      w3[2] = __byte_perm_S (w3[2], w3[1], selector);
      w3[1] = __byte_perm_S (w3[1], w3[0], selector);
      w3[0] = __byte_perm_S (w3[0], w2[3], selector);
      w2[3] = __byte_perm_S (w2[3], w2[2], selector);
      w2[2] = __byte_perm_S (w2[2], w2[1], selector);
      w2[1] = __byte_perm_S (w2[1], w2[0], selector);
      w2[0] = __byte_perm_S (w2[0], w1[3], selector);
      w1[3] = __byte_perm_S (w1[3], w1[2], selector);
      w1[2] = __byte_perm_S (w1[2], w1[1], selector);
      w1[1] = __byte_perm_S (w1[1], w1[0], selector);
      w1[0] = __byte_perm_S (w1[0], w0[3], selector);
      w0[3] = __byte_perm_S (w0[3], w0[2], selector);
      w0[2] = __byte_perm_S (w0[2], w0[1], selector);
      w0[1] = __byte_perm_S (w0[1], w0[0], selector);
      w0[0] = __byte_perm_S (w0[0], 0, selector);
      break;

    case 1:
      w3[3] = __byte_perm_S (w3[2], w3[1], selector);
      w3[2] = __byte_perm_S (w3[1], w3[0], selector);
      w3[1] = __byte_perm_S (w3[0], w2[3], selector);
      w3[0] = __byte_perm_S (w2[3], w2[2], selector);
      w2[3] = __byte_perm_S (w2[2], w2[1], selector);
      w2[2] = __byte_perm_S (w2[1], w2[0], selector);
      w2[1] = __byte_perm_S (w2[0], w1[3], selector);
      w2[0] = __byte_perm_S (w1[3], w1[2], selector);
      w1[3] = __byte_perm_S (w1[2], w1[1], selector);
      w1[2] = __byte_perm_S (w1[1], w1[0], selector);
      w1[1] = __byte_perm_S (w1[0], w0[3], selector);
      w1[0] = __byte_perm_S (w0[3], w0[2], selector);
      w0[3] = __byte_perm_S (w0[2], w0[1], selector);
      w0[2] = __byte_perm_S (w0[1], w0[0], selector);
      w0[1] = __byte_perm_S (w0[0], 0, selector);
      w0[0] = 0;
      break;

    case 2:
      w3[3] = __byte_perm_S (w3[1], w3[0], selector);
      w3[2] = __byte_perm_S (w3[0], w2[3], selector);
      w3[1] = __byte_perm_S (w2[3], w2[2], selector);
      w3[0] = __byte_perm_S (w2[2], w2[1], selector);
      w2[3] = __byte_perm_S (w2[1], w2[0], selector);
      w2[2] = __byte_perm_S (w2[0], w1[3], selector);
      w2[1] = __byte_perm_S (w1[3], w1[2], selector);
      w2[0] = __byte_perm_S (w1[2], w1[1], selector);
      w1[3] = __byte_perm_S (w1[1], w1[0], selector);
      w1[2] = __byte_perm_S (w1[0], w0[3], selector);
      w1[1] = __byte_perm_S (w0[3], w0[2], selector);
      w1[0] = __byte_perm_S (w0[2], w0[1], selector);
      w0[3] = __byte_perm_S (w0[1], w0[0], selector);
      w0[2] = __byte_perm_S (w0[0], 0, selector);
      w0[1] = 0; w0[0] = 0;
      break;

    case 3:
      w3[3] = __byte_perm_S (w3[0], w2[3], selector);
      w3[2] = __byte_perm_S (w2[3], w2[2], selector);
      w3[1] = __byte_perm_S (w2[2], w2[1], selector);
      w3[0] = __byte_perm_S (w2[1], w2[0], selector);
      w2[3] = __byte_perm_S (w2[0], w1[3], selector);
      w2[2] = __byte_perm_S (w1[3], w1[2], selector);
      w2[1] = __byte_perm_S (w1[2], w1[1], selector);
      w2[0] = __byte_perm_S (w1[1], w1[0], selector);
      w1[3] = __byte_perm_S (w1[0], w0[3], selector);
      w1[2] = __byte_perm_S (w0[3], w0[2], selector);
      w1[1] = __byte_perm_S (w0[2], w0[1], selector);
      w1[0] = __byte_perm_S (w0[1], w0[0], selector);
      w0[3] = __byte_perm_S (w0[0], 0, selector);
      w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 4:
      w3[3] = __byte_perm_S (w2[3], w2[2], selector);
      w3[2] = __byte_perm_S (w2[2], w2[1], selector);
      w3[1] = __byte_perm_S (w2[1], w2[0], selector);
      w3[0] = __byte_perm_S (w2[0], w1[3], selector);
      w2[3] = __byte_perm_S (w1[3], w1[2], selector);
      w2[2] = __byte_perm_S (w1[2], w1[1], selector);
      w2[1] = __byte_perm_S (w1[1], w1[0], selector);
      w2[0] = __byte_perm_S (w1[0], w0[3], selector);
      w1[3] = __byte_perm_S (w0[3], w0[2], selector);
      w1[2] = __byte_perm_S (w0[2], w0[1], selector);
      w1[1] = __byte_perm_S (w0[1], w0[0], selector);
      w1[0] = __byte_perm_S (w0[0], 0, selector);
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 5:
      w3[3] = __byte_perm_S (w2[2], w2[1], selector);
      w3[2] = __byte_perm_S (w2[1], w2[0], selector);
      w3[1] = __byte_perm_S (w2[0], w1[3], selector);
      w3[0] = __byte_perm_S (w1[3], w1[2], selector);
      w2[3] = __byte_perm_S (w1[2], w1[1], selector);
      w2[2] = __byte_perm_S (w1[1], w1[0], selector);
      w2[1] = __byte_perm_S (w1[0], w0[3], selector);
      w2[0] = __byte_perm_S (w0[3], w0[2], selector);
      w1[3] = __byte_perm_S (w0[2], w0[1], selector);
      w1[2] = __byte_perm_S (w0[1], w0[0], selector);
      w1[1] = __byte_perm_S (w0[0], 0, selector);
      w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 6:
      w3[3] = __byte_perm_S (w2[1], w2[0], selector);
      w3[2] = __byte_perm_S (w2[0], w1[3], selector);
      w3[1] = __byte_perm_S (w1[3], w1[2], selector);
      w3[0] = __byte_perm_S (w1[2], w1[1], selector);
      w2[3] = __byte_perm_S (w1[1], w1[0], selector);
      w2[2] = __byte_perm_S (w1[0], w0[3], selector);
      w2[1] = __byte_perm_S (w0[3], w0[2], selector);
      w2[0] = __byte_perm_S (w0[2], w0[1], selector);
      w1[3] = __byte_perm_S (w0[1], w0[0], selector);
      w1[2] = __byte_perm_S (w0[0], 0, selector);
      w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 7:
      w3[3] = __byte_perm_S (w2[0], w1[3], selector);
      w3[2] = __byte_perm_S (w1[3], w1[2], selector);
      w3[1] = __byte_perm_S (w1[2], w1[1], selector);
      w3[0] = __byte_perm_S (w1[1], w1[0], selector);
      w2[3] = __byte_perm_S (w1[0], w0[3], selector);
      w2[2] = __byte_perm_S (w0[3], w0[2], selector);
      w2[1] = __byte_perm_S (w0[2], w0[1], selector);
      w2[0] = __byte_perm_S (w0[1], w0[0], selector);
      w1[3] = __byte_perm_S (w0[0], 0, selector);
      w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 8:
      w3[3] = __byte_perm_S (w1[3], w1[2], selector);
      w3[2] = __byte_perm_S (w1[2], w1[1], selector);
      w3[1] = __byte_perm_S (w1[1], w1[0], selector);
      w3[0] = __byte_perm_S (w1[0], w0[3], selector);
      w2[3] = __byte_perm_S (w0[3], w0[2], selector);
      w2[2] = __byte_perm_S (w0[2], w0[1], selector);
      w2[1] = __byte_perm_S (w0[1], w0[0], selector);
      w2[0] = __byte_perm_S (w0[0], 0, selector);
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 9:
      w3[3] = __byte_perm_S (w1[2], w1[1], selector);
      w3[2] = __byte_perm_S (w1[1], w1[0], selector);
      w3[1] = __byte_perm_S (w1[0], w0[3], selector);
      w3[0] = __byte_perm_S (w0[3], w0[2], selector);
      w2[3] = __byte_perm_S (w0[2], w0[1], selector);
      w2[2] = __byte_perm_S (w0[1], w0[0], selector);
      w2[1] = __byte_perm_S (w0[0], 0, selector);
      w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 10:
      w3[3] = __byte_perm_S (w1[1], w1[0], selector);
      w3[2] = __byte_perm_S (w1[0], w0[3], selector);
      w3[1] = __byte_perm_S (w0[3], w0[2], selector);
      w3[0] = __byte_perm_S (w0[2], w0[1], selector);
      w2[3] = __byte_perm_S (w0[1], w0[0], selector);
      w2[2] = __byte_perm_S (w0[0], 0, selector);
      w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 11:
      w3[3] = __byte_perm_S (w1[0], w0[3], selector);
      w3[2] = __byte_perm_S (w0[3], w0[2], selector);
      w3[1] = __byte_perm_S (w0[2], w0[1], selector);
      w3[0] = __byte_perm_S (w0[1], w0[0], selector);
      w2[3] = __byte_perm_S (w0[0], 0, selector);
      w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 12:
      w3[3] = __byte_perm_S (w0[3], w0[2], selector);
      w3[2] = __byte_perm_S (w0[2], w0[1], selector);
      w3[1] = __byte_perm_S (w0[1], w0[0], selector);
      w3[0] = __byte_perm_S (w0[0], 0, selector);
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 13:
      w3[3] = __byte_perm_S (w0[2], w0[1], selector);
      w3[2] = __byte_perm_S (w0[1], w0[0], selector);
      w3[1] = __byte_perm_S (w0[0], 0, selector);
      w3[0] = 0;
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 14:
      w3[3] = __byte_perm_S (w0[1], w0[0], selector);
      w3[2] = __byte_perm_S (w0[0], 0, selector);
      w3[1] = 0; w3[0] = 0;
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;

    case 15:
      w3[3] = __byte_perm_S (w0[0], 0, selector);
      w3[2] = 0; w3[1] = 0; w3[0] = 0;
      w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0;
      w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0;
      w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0;
      break;
  }
  #endif
}

DECLSPEC void switch_buffer_by_offset_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], const u32 offset) { const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: c0[0] = amd_bytealign_S (w3[3], 0, offset); w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); w3[1] = amd_bytealign_S (w3[0], w3[1], offset); w3[0] = amd_bytealign_S (w2[3], w3[0], offset); w2[3] = amd_bytealign_S (w2[2], w2[3], offset); w2[2] = amd_bytealign_S (w2[1], w2[2], offset); w2[1] = amd_bytealign_S (w2[0], w2[1], offset); w2[0] = amd_bytealign_S (w1[3], w2[0], offset); w1[3] = amd_bytealign_S (w1[2], w1[3], offset); w1[2] = amd_bytealign_S (w1[1], w1[2], offset); w1[1] = amd_bytealign_S (w1[0], w1[1], offset); w1[0] = amd_bytealign_S (w0[3], w1[0], offset); w0[3] = amd_bytealign_S (w0[2], w0[3], offset); w0[2] = amd_bytealign_S (w0[1], w0[2], offset); w0[1] = amd_bytealign_S (w0[0], w0[1], offset); w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: c0[1] = amd_bytealign_S (w3[3], 0, offset); c0[0] = amd_bytealign_S (w3[2], w3[3], offset); w3[3] = amd_bytealign_S (w3[1], w3[2], offset); w3[2] = amd_bytealign_S (w3[0], w3[1], offset); w3[1] = amd_bytealign_S (w2[3], w3[0], offset); w3[0] = amd_bytealign_S 
(w2[2], w2[3], offset); w2[3] = amd_bytealign_S (w2[1], w2[2], offset); w2[2] = amd_bytealign_S (w2[0], w2[1], offset); w2[1] = amd_bytealign_S (w1[3], w2[0], offset); w2[0] = amd_bytealign_S (w1[2], w1[3], offset); w1[3] = amd_bytealign_S (w1[1], w1[2], offset); w1[2] = amd_bytealign_S (w1[0], w1[1], offset); w1[1] = amd_bytealign_S (w0[3], w1[0], offset); w1[0] = amd_bytealign_S (w0[2], w0[3], offset); w0[3] = amd_bytealign_S (w0[1], w0[2], offset); w0[2] = amd_bytealign_S (w0[0], w0[1], offset); w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: c0[2] = amd_bytealign_S (w3[3], 0, offset); c0[1] = amd_bytealign_S (w3[2], w3[3], offset); c0[0] = amd_bytealign_S (w3[1], w3[2], offset); w3[3] = amd_bytealign_S (w3[0], w3[1], offset); w3[2] = amd_bytealign_S (w2[3], w3[0], offset); w3[1] = amd_bytealign_S (w2[2], w2[3], offset); w3[0] = amd_bytealign_S (w2[1], w2[2], offset); w2[3] = amd_bytealign_S (w2[0], w2[1], offset); w2[2] = amd_bytealign_S (w1[3], w2[0], offset); w2[1] = amd_bytealign_S (w1[2], w1[3], offset); w2[0] = amd_bytealign_S (w1[1], w1[2], offset); w1[3] = amd_bytealign_S (w1[0], w1[1], offset); w1[2] = amd_bytealign_S (w0[3], w1[0], offset); w1[1] = amd_bytealign_S (w0[2], w0[3], offset); w1[0] = amd_bytealign_S (w0[1], w0[2], offset); w0[3] = amd_bytealign_S (w0[0], w0[1], offset); w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = amd_bytealign_S (w3[3], 0, offset); c0[2] = amd_bytealign_S (w3[2], w3[3], offset); c0[1] = amd_bytealign_S (w3[1], w3[2], offset); c0[0] = amd_bytealign_S (w3[0], w3[1], offset); w3[3] = amd_bytealign_S (w2[3], w3[0], offset); w3[2] = amd_bytealign_S (w2[2], w2[3], offset); w3[1] = amd_bytealign_S (w2[1], w2[2], offset); w3[0] = amd_bytealign_S (w2[0], w2[1], offset); w2[3] = amd_bytealign_S (w1[3], w2[0], offset); w2[2] = amd_bytealign_S (w1[2], w1[3], offset); w2[1] = amd_bytealign_S (w1[1], w1[2], offset); w2[0] = amd_bytealign_S (w1[0], w1[1], offset); w1[3] 
= amd_bytealign_S (w0[3], w1[0], offset); w1[2] = amd_bytealign_S (w0[2], w0[3], offset); w1[1] = amd_bytealign_S (w0[1], w0[2], offset); w1[0] = amd_bytealign_S (w0[0], w0[1], offset); w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = amd_bytealign_S (w3[3], 0, offset); c0[3] = amd_bytealign_S (w3[2], w3[3], offset); c0[2] = amd_bytealign_S (w3[1], w3[2], offset); c0[1] = amd_bytealign_S (w3[0], w3[1], offset); c0[0] = amd_bytealign_S (w2[3], w3[0], offset); w3[3] = amd_bytealign_S (w2[2], w2[3], offset); w3[2] = amd_bytealign_S (w2[1], w2[2], offset); w3[1] = amd_bytealign_S (w2[0], w2[1], offset); w3[0] = amd_bytealign_S (w1[3], w2[0], offset); w2[3] = amd_bytealign_S (w1[2], w1[3], offset); w2[2] = amd_bytealign_S (w1[1], w1[2], offset); w2[1] = amd_bytealign_S (w1[0], w1[1], offset); w2[0] = amd_bytealign_S (w0[3], w1[0], offset); w1[3] = amd_bytealign_S (w0[2], w0[3], offset); w1[2] = amd_bytealign_S (w0[1], w0[2], offset); w1[1] = amd_bytealign_S (w0[0], w0[1], offset); w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = amd_bytealign_S (w3[3], 0, offset); c1[0] = amd_bytealign_S (w3[2], w3[3], offset); c0[3] = amd_bytealign_S (w3[1], w3[2], offset); c0[2] = amd_bytealign_S (w3[0], w3[1], offset); c0[1] = amd_bytealign_S (w2[3], w3[0], offset); c0[0] = amd_bytealign_S (w2[2], w2[3], offset); w3[3] = amd_bytealign_S (w2[1], w2[2], offset); w3[2] = amd_bytealign_S (w2[0], w2[1], offset); w3[1] = amd_bytealign_S (w1[3], w2[0], offset); w3[0] = amd_bytealign_S (w1[2], w1[3], offset); w2[3] = amd_bytealign_S (w1[1], w1[2], offset); w2[2] = amd_bytealign_S (w1[0], w1[1], offset); w2[1] = amd_bytealign_S (w0[3], w1[0], offset); w2[0] = amd_bytealign_S (w0[2], w0[3], offset); w1[3] = amd_bytealign_S (w0[1], w0[2], offset); w1[2] = amd_bytealign_S (w0[0], w0[1], offset); w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; 
w0[0] = 0; break; case 6: c1[2] = amd_bytealign_S (w3[3], 0, offset); c1[1] = amd_bytealign_S (w3[2], w3[3], offset); c1[0] = amd_bytealign_S (w3[1], w3[2], offset); c0[3] = amd_bytealign_S (w3[0], w3[1], offset); c0[2] = amd_bytealign_S (w2[3], w3[0], offset); c0[1] = amd_bytealign_S (w2[2], w2[3], offset); c0[0] = amd_bytealign_S (w2[1], w2[2], offset); w3[3] = amd_bytealign_S (w2[0], w2[1], offset); w3[2] = amd_bytealign_S (w1[3], w2[0], offset); w3[1] = amd_bytealign_S (w1[2], w1[3], offset); w3[0] = amd_bytealign_S (w1[1], w1[2], offset); w2[3] = amd_bytealign_S (w1[0], w1[1], offset); w2[2] = amd_bytealign_S (w0[3], w1[0], offset); w2[1] = amd_bytealign_S (w0[2], w0[3], offset); w2[0] = amd_bytealign_S (w0[1], w0[2], offset); w1[3] = amd_bytealign_S (w0[0], w0[1], offset); w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = amd_bytealign_S (w3[3], 0, offset); c1[2] = amd_bytealign_S (w3[2], w3[3], offset); c1[1] = amd_bytealign_S (w3[1], w3[2], offset); c1[0] = amd_bytealign_S (w3[0], w3[1], offset); c0[3] = amd_bytealign_S (w2[3], w3[0], offset); c0[2] = amd_bytealign_S (w2[2], w2[3], offset); c0[1] = amd_bytealign_S (w2[1], w2[2], offset); c0[0] = amd_bytealign_S (w2[0], w2[1], offset); w3[3] = amd_bytealign_S (w1[3], w2[0], offset); w3[2] = amd_bytealign_S (w1[2], w1[3], offset); w3[1] = amd_bytealign_S (w1[1], w1[2], offset); w3[0] = amd_bytealign_S (w1[0], w1[1], offset); w2[3] = amd_bytealign_S (w0[3], w1[0], offset); w2[2] = amd_bytealign_S (w0[2], w0[3], offset); w2[1] = amd_bytealign_S (w0[1], w0[2], offset); w2[0] = amd_bytealign_S (w0[0], w0[1], offset); w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = amd_bytealign_S (w3[3], 0, offset); c1[3] = amd_bytealign_S (w3[2], w3[3], offset); c1[2] = amd_bytealign_S (w3[1], w3[2], offset); c1[1] = amd_bytealign_S (w3[0], w3[1], 
offset); c1[0] = amd_bytealign_S (w2[3], w3[0], offset); c0[3] = amd_bytealign_S (w2[2], w2[3], offset); c0[2] = amd_bytealign_S (w2[1], w2[2], offset); c0[1] = amd_bytealign_S (w2[0], w2[1], offset); c0[0] = amd_bytealign_S (w1[3], w2[0], offset); w3[3] = amd_bytealign_S (w1[2], w1[3], offset); w3[2] = amd_bytealign_S (w1[1], w1[2], offset); w3[1] = amd_bytealign_S (w1[0], w1[1], offset); w3[0] = amd_bytealign_S (w0[3], w1[0], offset); w2[3] = amd_bytealign_S (w0[2], w0[3], offset); w2[2] = amd_bytealign_S (w0[1], w0[2], offset); w2[1] = amd_bytealign_S (w0[0], w0[1], offset); w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = amd_bytealign_S (w3[3], 0, offset); c2[0] = amd_bytealign_S (w3[2], w3[3], offset); c1[3] = amd_bytealign_S (w3[1], w3[2], offset); c1[2] = amd_bytealign_S (w3[0], w3[1], offset); c1[1] = amd_bytealign_S (w2[3], w3[0], offset); c1[0] = amd_bytealign_S (w2[2], w2[3], offset); c0[3] = amd_bytealign_S (w2[1], w2[2], offset); c0[2] = amd_bytealign_S (w2[0], w2[1], offset); c0[1] = amd_bytealign_S (w1[3], w2[0], offset); c0[0] = amd_bytealign_S (w1[2], w1[3], offset); w3[3] = amd_bytealign_S (w1[1], w1[2], offset); w3[2] = amd_bytealign_S (w1[0], w1[1], offset); w3[1] = amd_bytealign_S (w0[3], w1[0], offset); w3[0] = amd_bytealign_S (w0[2], w0[3], offset); w2[3] = amd_bytealign_S (w0[1], w0[2], offset); w2[2] = amd_bytealign_S (w0[0], w0[1], offset); w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = amd_bytealign_S (w3[3], 0, offset); c2[1] = amd_bytealign_S (w3[2], w3[3], offset); c2[0] = amd_bytealign_S (w3[1], w3[2], offset); c1[3] = amd_bytealign_S (w3[0], w3[1], offset); c1[2] = amd_bytealign_S (w2[3], w3[0], offset); c1[1] = amd_bytealign_S (w2[2], w2[3], offset); c1[0] = amd_bytealign_S (w2[1], w2[2], offset); c0[3] = 
amd_bytealign_S (w2[0], w2[1], offset); c0[2] = amd_bytealign_S (w1[3], w2[0], offset); c0[1] = amd_bytealign_S (w1[2], w1[3], offset); c0[0] = amd_bytealign_S (w1[1], w1[2], offset); w3[3] = amd_bytealign_S (w1[0], w1[1], offset); w3[2] = amd_bytealign_S (w0[3], w1[0], offset); w3[1] = amd_bytealign_S (w0[2], w0[3], offset); w3[0] = amd_bytealign_S (w0[1], w0[2], offset); w2[3] = amd_bytealign_S (w0[0], w0[1], offset); w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = amd_bytealign_S (w3[3], 0, offset); c2[2] = amd_bytealign_S (w3[2], w3[3], offset); c2[1] = amd_bytealign_S (w3[1], w3[2], offset); c2[0] = amd_bytealign_S (w3[0], w3[1], offset); c1[3] = amd_bytealign_S (w2[3], w3[0], offset); c1[2] = amd_bytealign_S (w2[2], w2[3], offset); c1[1] = amd_bytealign_S (w2[1], w2[2], offset); c1[0] = amd_bytealign_S (w2[0], w2[1], offset); c0[3] = amd_bytealign_S (w1[3], w2[0], offset); c0[2] = amd_bytealign_S (w1[2], w1[3], offset); c0[1] = amd_bytealign_S (w1[1], w1[2], offset); c0[0] = amd_bytealign_S (w1[0], w1[1], offset); w3[3] = amd_bytealign_S (w0[3], w1[0], offset); w3[2] = amd_bytealign_S (w0[2], w0[3], offset); w3[1] = amd_bytealign_S (w0[1], w0[2], offset); w3[0] = amd_bytealign_S (w0[0], w0[1], offset); w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = amd_bytealign_S (w3[3], 0, offset); c2[3] = amd_bytealign_S (w3[2], w3[3], offset); c2[2] = amd_bytealign_S (w3[1], w3[2], offset); c2[1] = amd_bytealign_S (w3[0], w3[1], offset); c2[0] = amd_bytealign_S (w2[3], w3[0], offset); c1[3] = amd_bytealign_S (w2[2], w2[3], offset); c1[2] = amd_bytealign_S (w2[1], w2[2], offset); c1[1] = amd_bytealign_S (w2[0], w2[1], offset); c1[0] = amd_bytealign_S (w1[3], w2[0], offset); c0[3] = amd_bytealign_S 
(w1[2], w1[3], offset); c0[2] = amd_bytealign_S (w1[1], w1[2], offset); c0[1] = amd_bytealign_S (w1[0], w1[1], offset); c0[0] = amd_bytealign_S (w0[3], w1[0], offset); w3[3] = amd_bytealign_S (w0[2], w0[3], offset); w3[2] = amd_bytealign_S (w0[1], w0[2], offset); w3[1] = amd_bytealign_S (w0[0], w0[1], offset); w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = amd_bytealign_S (w3[3], 0, offset); c3[0] = amd_bytealign_S (w3[2], w3[3], offset); c2[3] = amd_bytealign_S (w3[1], w3[2], offset); c2[2] = amd_bytealign_S (w3[0], w3[1], offset); c2[1] = amd_bytealign_S (w2[3], w3[0], offset); c2[0] = amd_bytealign_S (w2[2], w2[3], offset); c1[3] = amd_bytealign_S (w2[1], w2[2], offset); c1[2] = amd_bytealign_S (w2[0], w2[1], offset); c1[1] = amd_bytealign_S (w1[3], w2[0], offset); c1[0] = amd_bytealign_S (w1[2], w1[3], offset); c0[3] = amd_bytealign_S (w1[1], w1[2], offset); c0[2] = amd_bytealign_S (w1[0], w1[1], offset); c0[1] = amd_bytealign_S (w0[3], w1[0], offset); c0[0] = amd_bytealign_S (w0[2], w0[3], offset); w3[3] = amd_bytealign_S (w0[1], w0[2], offset); w3[2] = amd_bytealign_S (w0[0], w0[1], offset); w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = amd_bytealign_S (w3[3], 0, offset); c3[1] = amd_bytealign_S (w3[2], w3[3], offset); c3[0] = amd_bytealign_S (w3[1], w3[2], offset); c2[3] = amd_bytealign_S (w3[0], w3[1], offset); c2[2] = amd_bytealign_S (w2[3], w3[0], offset); c2[1] = amd_bytealign_S (w2[2], w2[3], offset); c2[0] = amd_bytealign_S (w2[1], w2[2], offset); c1[3] = amd_bytealign_S (w2[0], w2[1], offset); c1[2] = amd_bytealign_S (w1[3], w2[0], offset); c1[1] = amd_bytealign_S (w1[2], w1[3], offset); c1[0] = amd_bytealign_S (w1[1], w1[2], offset); 
c0[3] = amd_bytealign_S (w1[0], w1[1], offset); c0[2] = amd_bytealign_S (w0[3], w1[0], offset); c0[1] = amd_bytealign_S (w0[2], w0[3], offset); c0[0] = amd_bytealign_S (w0[1], w0[2], offset); w3[3] = amd_bytealign_S (w0[0], w0[1], offset); w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = amd_bytealign_S (w3[3], 0, offset); c3[2] = amd_bytealign_S (w3[2], w3[3], offset); c3[1] = amd_bytealign_S (w3[1], w3[2], offset); c3[0] = amd_bytealign_S (w3[0], w3[1], offset); c2[3] = amd_bytealign_S (w2[3], w3[0], offset); c2[2] = amd_bytealign_S (w2[2], w2[3], offset); c2[1] = amd_bytealign_S (w2[1], w2[2], offset); c2[0] = amd_bytealign_S (w2[0], w2[1], offset); c1[3] = amd_bytealign_S (w1[3], w2[0], offset); c1[2] = amd_bytealign_S (w1[2], w1[3], offset); c1[1] = amd_bytealign_S (w1[1], w1[2], offset); c1[0] = amd_bytealign_S (w1[0], w1[1], offset); c0[3] = amd_bytealign_S (w0[3], w1[0], offset); c0[2] = amd_bytealign_S (w0[2], w0[3], offset); c0[1] = amd_bytealign_S (w0[1], w0[2], offset); c0[0] = amd_bytealign_S (w0[0], w0[1], offset); w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: c0[0] = __byte_perm_S ( 0, w3[3], selector); w3[3] = __byte_perm_S (w3[3], w3[2], selector); w3[2] = __byte_perm_S (w3[2], w3[1], selector); w3[1] = __byte_perm_S (w3[1], w3[0], selector); w3[0] = __byte_perm_S (w3[0], w2[3], selector); w2[3] = __byte_perm_S (w2[3], w2[2], selector); 
w2[2] = __byte_perm_S (w2[2], w2[1], selector); w2[1] = __byte_perm_S (w2[1], w2[0], selector); w2[0] = __byte_perm_S (w2[0], w1[3], selector); w1[3] = __byte_perm_S (w1[3], w1[2], selector); w1[2] = __byte_perm_S (w1[2], w1[1], selector); w1[1] = __byte_perm_S (w1[1], w1[0], selector); w1[0] = __byte_perm_S (w1[0], w0[3], selector); w0[3] = __byte_perm_S (w0[3], w0[2], selector); w0[2] = __byte_perm_S (w0[2], w0[1], selector); w0[1] = __byte_perm_S (w0[1], w0[0], selector); w0[0] = __byte_perm_S (w0[0], 0, selector); break; case 1: c0[1] = __byte_perm_S ( 0, w3[3], selector); c0[0] = __byte_perm_S (w3[3], w3[2], selector); w3[3] = __byte_perm_S (w3[2], w3[1], selector); w3[2] = __byte_perm_S (w3[1], w3[0], selector); w3[1] = __byte_perm_S (w3[0], w2[3], selector); w3[0] = __byte_perm_S (w2[3], w2[2], selector); w2[3] = __byte_perm_S (w2[2], w2[1], selector); w2[2] = __byte_perm_S (w2[1], w2[0], selector); w2[1] = __byte_perm_S (w2[0], w1[3], selector); w2[0] = __byte_perm_S (w1[3], w1[2], selector); w1[3] = __byte_perm_S (w1[2], w1[1], selector); w1[2] = __byte_perm_S (w1[1], w1[0], selector); w1[1] = __byte_perm_S (w1[0], w0[3], selector); w1[0] = __byte_perm_S (w0[3], w0[2], selector); w0[3] = __byte_perm_S (w0[2], w0[1], selector); w0[2] = __byte_perm_S (w0[1], w0[0], selector); w0[1] = __byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: c0[2] = __byte_perm_S ( 0, w3[3], selector); c0[1] = __byte_perm_S (w3[3], w3[2], selector); c0[0] = __byte_perm_S (w3[2], w3[1], selector); w3[3] = __byte_perm_S (w3[1], w3[0], selector); w3[2] = __byte_perm_S (w3[0], w2[3], selector); w3[1] = __byte_perm_S (w2[3], w2[2], selector); w3[0] = __byte_perm_S (w2[2], w2[1], selector); w2[3] = __byte_perm_S (w2[1], w2[0], selector); w2[2] = __byte_perm_S (w2[0], w1[3], selector); w2[1] = __byte_perm_S (w1[3], w1[2], selector); w2[0] = __byte_perm_S (w1[2], w1[1], selector); w1[3] = __byte_perm_S (w1[1], w1[0], selector); w1[2] = __byte_perm_S (w1[0], w0[3], selector); 
w1[1] = __byte_perm_S (w0[3], w0[2], selector); w1[0] = __byte_perm_S (w0[2], w0[1], selector); w0[3] = __byte_perm_S (w0[1], w0[0], selector); w0[2] = __byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = __byte_perm_S ( 0, w3[3], selector); c0[2] = __byte_perm_S (w3[3], w3[2], selector); c0[1] = __byte_perm_S (w3[2], w3[1], selector); c0[0] = __byte_perm_S (w3[1], w3[0], selector); w3[3] = __byte_perm_S (w3[0], w2[3], selector); w3[2] = __byte_perm_S (w2[3], w2[2], selector); w3[1] = __byte_perm_S (w2[2], w2[1], selector); w3[0] = __byte_perm_S (w2[1], w2[0], selector); w2[3] = __byte_perm_S (w2[0], w1[3], selector); w2[2] = __byte_perm_S (w1[3], w1[2], selector); w2[1] = __byte_perm_S (w1[2], w1[1], selector); w2[0] = __byte_perm_S (w1[1], w1[0], selector); w1[3] = __byte_perm_S (w1[0], w0[3], selector); w1[2] = __byte_perm_S (w0[3], w0[2], selector); w1[1] = __byte_perm_S (w0[2], w0[1], selector); w1[0] = __byte_perm_S (w0[1], w0[0], selector); w0[3] = __byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = __byte_perm_S ( 0, w3[3], selector); c0[3] = __byte_perm_S (w3[3], w3[2], selector); c0[2] = __byte_perm_S (w3[2], w3[1], selector); c0[1] = __byte_perm_S (w3[1], w3[0], selector); c0[0] = __byte_perm_S (w3[0], w2[3], selector); w3[3] = __byte_perm_S (w2[3], w2[2], selector); w3[2] = __byte_perm_S (w2[2], w2[1], selector); w3[1] = __byte_perm_S (w2[1], w2[0], selector); w3[0] = __byte_perm_S (w2[0], w1[3], selector); w2[3] = __byte_perm_S (w1[3], w1[2], selector); w2[2] = __byte_perm_S (w1[2], w1[1], selector); w2[1] = __byte_perm_S (w1[1], w1[0], selector); w2[0] = __byte_perm_S (w1[0], w0[3], selector); w1[3] = __byte_perm_S (w0[3], w0[2], selector); w1[2] = __byte_perm_S (w0[2], w0[1], selector); w1[1] = __byte_perm_S (w0[1], w0[0], selector); w1[0] = __byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = __byte_perm_S ( 0, w3[3], selector); 
c1[0] = __byte_perm_S (w3[3], w3[2], selector); c0[3] = __byte_perm_S (w3[2], w3[1], selector); c0[2] = __byte_perm_S (w3[1], w3[0], selector); c0[1] = __byte_perm_S (w3[0], w2[3], selector); c0[0] = __byte_perm_S (w2[3], w2[2], selector); w3[3] = __byte_perm_S (w2[2], w2[1], selector); w3[2] = __byte_perm_S (w2[1], w2[0], selector); w3[1] = __byte_perm_S (w2[0], w1[3], selector); w3[0] = __byte_perm_S (w1[3], w1[2], selector); w2[3] = __byte_perm_S (w1[2], w1[1], selector); w2[2] = __byte_perm_S (w1[1], w1[0], selector); w2[1] = __byte_perm_S (w1[0], w0[3], selector); w2[0] = __byte_perm_S (w0[3], w0[2], selector); w1[3] = __byte_perm_S (w0[2], w0[1], selector); w1[2] = __byte_perm_S (w0[1], w0[0], selector); w1[1] = __byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = __byte_perm_S ( 0, w3[3], selector); c1[1] = __byte_perm_S (w3[3], w3[2], selector); c1[0] = __byte_perm_S (w3[2], w3[1], selector); c0[3] = __byte_perm_S (w3[1], w3[0], selector); c0[2] = __byte_perm_S (w3[0], w2[3], selector); c0[1] = __byte_perm_S (w2[3], w2[2], selector); c0[0] = __byte_perm_S (w2[2], w2[1], selector); w3[3] = __byte_perm_S (w2[1], w2[0], selector); w3[2] = __byte_perm_S (w2[0], w1[3], selector); w3[1] = __byte_perm_S (w1[3], w1[2], selector); w3[0] = __byte_perm_S (w1[2], w1[1], selector); w2[3] = __byte_perm_S (w1[1], w1[0], selector); w2[2] = __byte_perm_S (w1[0], w0[3], selector); w2[1] = __byte_perm_S (w0[3], w0[2], selector); w2[0] = __byte_perm_S (w0[2], w0[1], selector); w1[3] = __byte_perm_S (w0[1], w0[0], selector); w1[2] = __byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = __byte_perm_S ( 0, w3[3], selector); c1[2] = __byte_perm_S (w3[3], w3[2], selector); c1[1] = __byte_perm_S (w3[2], w3[1], selector); c1[0] = __byte_perm_S (w3[1], w3[0], selector); c0[3] = __byte_perm_S (w3[0], w2[3], selector); c0[2] = __byte_perm_S (w2[3], w2[2], 
selector); c0[1] = __byte_perm_S (w2[2], w2[1], selector); c0[0] = __byte_perm_S (w2[1], w2[0], selector); w3[3] = __byte_perm_S (w2[0], w1[3], selector); w3[2] = __byte_perm_S (w1[3], w1[2], selector); w3[1] = __byte_perm_S (w1[2], w1[1], selector); w3[0] = __byte_perm_S (w1[1], w1[0], selector); w2[3] = __byte_perm_S (w1[0], w0[3], selector); w2[2] = __byte_perm_S (w0[3], w0[2], selector); w2[1] = __byte_perm_S (w0[2], w0[1], selector); w2[0] = __byte_perm_S (w0[1], w0[0], selector); w1[3] = __byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = __byte_perm_S ( 0, w3[3], selector); c1[3] = __byte_perm_S (w3[3], w3[2], selector); c1[2] = __byte_perm_S (w3[2], w3[1], selector); c1[1] = __byte_perm_S (w3[1], w3[0], selector); c1[0] = __byte_perm_S (w3[0], w2[3], selector); c0[3] = __byte_perm_S (w2[3], w2[2], selector); c0[2] = __byte_perm_S (w2[2], w2[1], selector); c0[1] = __byte_perm_S (w2[1], w2[0], selector); c0[0] = __byte_perm_S (w2[0], w1[3], selector); w3[3] = __byte_perm_S (w1[3], w1[2], selector); w3[2] = __byte_perm_S (w1[2], w1[1], selector); w3[1] = __byte_perm_S (w1[1], w1[0], selector); w3[0] = __byte_perm_S (w1[0], w0[3], selector); w2[3] = __byte_perm_S (w0[3], w0[2], selector); w2[2] = __byte_perm_S (w0[2], w0[1], selector); w2[1] = __byte_perm_S (w0[1], w0[0], selector); w2[0] = __byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = __byte_perm_S ( 0, w3[3], selector); c2[0] = __byte_perm_S (w3[3], w3[2], selector); c1[3] = __byte_perm_S (w3[2], w3[1], selector); c1[2] = __byte_perm_S (w3[1], w3[0], selector); c1[1] = __byte_perm_S (w3[0], w2[3], selector); c1[0] = __byte_perm_S (w2[3], w2[2], selector); c0[3] = __byte_perm_S (w2[2], w2[1], selector); c0[2] = __byte_perm_S (w2[1], w2[0], selector); c0[1] = __byte_perm_S (w2[0], w1[3], selector); c0[0] = __byte_perm_S (w1[3], 
w1[2], selector); w3[3] = __byte_perm_S (w1[2], w1[1], selector); w3[2] = __byte_perm_S (w1[1], w1[0], selector); w3[1] = __byte_perm_S (w1[0], w0[3], selector); w3[0] = __byte_perm_S (w0[3], w0[2], selector); w2[3] = __byte_perm_S (w0[2], w0[1], selector); w2[2] = __byte_perm_S (w0[1], w0[0], selector); w2[1] = __byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = __byte_perm_S ( 0, w3[3], selector); c2[1] = __byte_perm_S (w3[3], w3[2], selector); c2[0] = __byte_perm_S (w3[2], w3[1], selector); c1[3] = __byte_perm_S (w3[1], w3[0], selector); c1[2] = __byte_perm_S (w3[0], w2[3], selector); c1[1] = __byte_perm_S (w2[3], w2[2], selector); c1[0] = __byte_perm_S (w2[2], w2[1], selector); c0[3] = __byte_perm_S (w2[1], w2[0], selector); c0[2] = __byte_perm_S (w2[0], w1[3], selector); c0[1] = __byte_perm_S (w1[3], w1[2], selector); c0[0] = __byte_perm_S (w1[2], w1[1], selector); w3[3] = __byte_perm_S (w1[1], w1[0], selector); w3[2] = __byte_perm_S (w1[0], w0[3], selector); w3[1] = __byte_perm_S (w0[3], w0[2], selector); w3[0] = __byte_perm_S (w0[2], w0[1], selector); w2[3] = __byte_perm_S (w0[1], w0[0], selector); w2[2] = __byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = __byte_perm_S ( 0, w3[3], selector); c2[2] = __byte_perm_S (w3[3], w3[2], selector); c2[1] = __byte_perm_S (w3[2], w3[1], selector); c2[0] = __byte_perm_S (w3[1], w3[0], selector); c1[3] = __byte_perm_S (w3[0], w2[3], selector); c1[2] = __byte_perm_S (w2[3], w2[2], selector); c1[1] = __byte_perm_S (w2[2], w2[1], selector); c1[0] = __byte_perm_S (w2[1], w2[0], selector); c0[3] = __byte_perm_S (w2[0], w1[3], selector); c0[2] = __byte_perm_S (w1[3], w1[2], selector); c0[1] = __byte_perm_S (w1[2], w1[1], selector); c0[0] = __byte_perm_S (w1[1], w1[0], selector); w3[3] = __byte_perm_S 
(w1[0], w0[3], selector); w3[2] = __byte_perm_S (w0[3], w0[2], selector); w3[1] = __byte_perm_S (w0[2], w0[1], selector); w3[0] = __byte_perm_S (w0[1], w0[0], selector); w2[3] = __byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = __byte_perm_S ( 0, w3[3], selector); c2[3] = __byte_perm_S (w3[3], w3[2], selector); c2[2] = __byte_perm_S (w3[2], w3[1], selector); c2[1] = __byte_perm_S (w3[1], w3[0], selector); c2[0] = __byte_perm_S (w3[0], w2[3], selector); c1[3] = __byte_perm_S (w2[3], w2[2], selector); c1[2] = __byte_perm_S (w2[2], w2[1], selector); c1[1] = __byte_perm_S (w2[1], w2[0], selector); c1[0] = __byte_perm_S (w2[0], w1[3], selector); c0[3] = __byte_perm_S (w1[3], w1[2], selector); c0[2] = __byte_perm_S (w1[2], w1[1], selector); c0[1] = __byte_perm_S (w1[1], w1[0], selector); c0[0] = __byte_perm_S (w1[0], w0[3], selector); w3[3] = __byte_perm_S (w0[3], w0[2], selector); w3[2] = __byte_perm_S (w0[2], w0[1], selector); w3[1] = __byte_perm_S (w0[1], w0[0], selector); w3[0] = __byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = __byte_perm_S ( 0, w3[3], selector); c3[0] = __byte_perm_S (w3[3], w3[2], selector); c2[3] = __byte_perm_S (w3[2], w3[1], selector); c2[2] = __byte_perm_S (w3[1], w3[0], selector); c2[1] = __byte_perm_S (w3[0], w2[3], selector); c2[0] = __byte_perm_S (w2[3], w2[2], selector); c1[3] = __byte_perm_S (w2[2], w2[1], selector); c1[2] = __byte_perm_S (w2[1], w2[0], selector); c1[1] = __byte_perm_S (w2[0], w1[3], selector); c1[0] = __byte_perm_S (w1[3], w1[2], selector); c0[3] = __byte_perm_S (w1[2], w1[1], selector); c0[2] = __byte_perm_S (w1[1], w1[0], selector); c0[1] = __byte_perm_S (w1[0], w0[3], selector); c0[0] = __byte_perm_S (w0[3], w0[2], selector); w3[3] = __byte_perm_S 
(w0[2], w0[1], selector); w3[2] = __byte_perm_S (w0[1], w0[0], selector); w3[1] = __byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = __byte_perm_S ( 0, w3[3], selector); c3[1] = __byte_perm_S (w3[3], w3[2], selector); c3[0] = __byte_perm_S (w3[2], w3[1], selector); c2[3] = __byte_perm_S (w3[1], w3[0], selector); c2[2] = __byte_perm_S (w3[0], w2[3], selector); c2[1] = __byte_perm_S (w2[3], w2[2], selector); c2[0] = __byte_perm_S (w2[2], w2[1], selector); c1[3] = __byte_perm_S (w2[1], w2[0], selector); c1[2] = __byte_perm_S (w2[0], w1[3], selector); c1[1] = __byte_perm_S (w1[3], w1[2], selector); c1[0] = __byte_perm_S (w1[2], w1[1], selector); c0[3] = __byte_perm_S (w1[1], w1[0], selector); c0[2] = __byte_perm_S (w1[0], w0[3], selector); c0[1] = __byte_perm_S (w0[3], w0[2], selector); c0[0] = __byte_perm_S (w0[2], w0[1], selector); w3[3] = __byte_perm_S (w0[1], w0[0], selector); w3[2] = __byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = __byte_perm_S ( 0, w3[3], selector); c3[2] = __byte_perm_S (w3[3], w3[2], selector); c3[1] = __byte_perm_S (w3[2], w3[1], selector); c3[0] = __byte_perm_S (w3[1], w3[0], selector); c2[3] = __byte_perm_S (w3[0], w2[3], selector); c2[2] = __byte_perm_S (w2[3], w2[2], selector); c2[1] = __byte_perm_S (w2[2], w2[1], selector); c2[0] = __byte_perm_S (w2[1], w2[0], selector); c1[3] = __byte_perm_S (w2[0], w1[3], selector); c1[2] = __byte_perm_S (w1[3], w1[2], selector); c1[1] = __byte_perm_S (w1[2], w1[1], selector); c1[0] = __byte_perm_S (w1[1], w1[0], selector); c0[3] = __byte_perm_S (w1[0], w0[3], selector); c0[2] = __byte_perm_S (w0[3], w0[2], selector); c0[1] = __byte_perm_S (w0[2], w0[1], selector); c0[0] = __byte_perm_S 
(w0[1], w0[0], selector); w3[3] = __byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_8x4_le_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); w0[3] = swap32_S (w0[3]); w1[0] = swap32_S (w1[0]); w1[1] = swap32_S (w1[1]); w1[2] = swap32_S (w1[2]); w1[3] = swap32_S (w1[3]); w2[0] = swap32_S (w2[0]); w2[1] = swap32_S (w2[1]); w2[2] = swap32_S (w2[2]); w2[3] = swap32_S (w2[3]); w3[0] = swap32_S (w3[0]); w3[1] = swap32_S (w3[1]); w3[2] = swap32_S (w3[2]); w3[3] = swap32_S (w3[3]); w4[0] = swap32_S (w4[0]); w4[1] = swap32_S (w4[1]); w4[2] = swap32_S (w4[2]); w4[3] = swap32_S (w4[3]); w5[0] = swap32_S (w5[0]); w5[1] = swap32_S (w5[1]); w5[2] = swap32_S (w5[2]); w5[3] = swap32_S (w5[3]); w6[0] = swap32_S (w6[0]); w6[1] = swap32_S (w6[1]); w6[2] = swap32_S (w6[2]); w6[3] = swap32_S (w6[3]); w7[0] = swap32_S (w7[0]); w7[1] = swap32_S (w7[1]); w7[2] = swap32_S (w7[2]); w7[3] = swap32_S (w7[3]); switch (offset_switch) { case 0: w7[3] = amd_bytealign_S (w7[2], w7[3], offset); w7[2] = amd_bytealign_S (w7[1], w7[2], offset); w7[1] = amd_bytealign_S (w7[0], w7[1], offset); w7[0] = amd_bytealign_S (w6[3], w7[0], offset); w6[3] = amd_bytealign_S (w6[2], w6[3], offset); w6[2] = amd_bytealign_S (w6[1], w6[2], offset); w6[1] = amd_bytealign_S (w6[0], w6[1], offset); w6[0] = amd_bytealign_S (w5[3], w6[0], offset); w5[3] = amd_bytealign_S (w5[2], w5[3], offset); w5[2] = amd_bytealign_S (w5[1], w5[2], offset); w5[1] = amd_bytealign_S (w5[0], w5[1], offset); w5[0] = 
amd_bytealign_S (w4[3], w5[0], offset); w4[3] = amd_bytealign_S (w4[2], w4[3], offset); w4[2] = amd_bytealign_S (w4[1], w4[2], offset); w4[1] = amd_bytealign_S (w4[0], w4[1], offset); w4[0] = amd_bytealign_S (w3[3], w4[0], offset); w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); w3[1] = amd_bytealign_S (w3[0], w3[1], offset); w3[0] = amd_bytealign_S (w2[3], w3[0], offset); w2[3] = amd_bytealign_S (w2[2], w2[3], offset); w2[2] = amd_bytealign_S (w2[1], w2[2], offset); w2[1] = amd_bytealign_S (w2[0], w2[1], offset); w2[0] = amd_bytealign_S (w1[3], w2[0], offset); w1[3] = amd_bytealign_S (w1[2], w1[3], offset); w1[2] = amd_bytealign_S (w1[1], w1[2], offset); w1[1] = amd_bytealign_S (w1[0], w1[1], offset); w1[0] = amd_bytealign_S (w0[3], w1[0], offset); w0[3] = amd_bytealign_S (w0[2], w0[3], offset); w0[2] = amd_bytealign_S (w0[1], w0[2], offset); w0[1] = amd_bytealign_S (w0[0], w0[1], offset); w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: w7[3] = amd_bytealign_S (w7[1], w7[2], offset); w7[2] = amd_bytealign_S (w7[0], w7[1], offset); w7[1] = amd_bytealign_S (w6[3], w7[0], offset); w7[0] = amd_bytealign_S (w6[2], w6[3], offset); w6[3] = amd_bytealign_S (w6[1], w6[2], offset); w6[2] = amd_bytealign_S (w6[0], w6[1], offset); w6[1] = amd_bytealign_S (w5[3], w6[0], offset); w6[0] = amd_bytealign_S (w5[2], w5[3], offset); w5[3] = amd_bytealign_S (w5[1], w5[2], offset); w5[2] = amd_bytealign_S (w5[0], w5[1], offset); w5[1] = amd_bytealign_S (w4[3], w5[0], offset); w5[0] = amd_bytealign_S (w4[2], w4[3], offset); w4[3] = amd_bytealign_S (w4[1], w4[2], offset); w4[2] = amd_bytealign_S (w4[0], w4[1], offset); w4[1] = amd_bytealign_S (w3[3], w4[0], offset); w4[0] = amd_bytealign_S (w3[2], w3[3], offset); w3[3] = amd_bytealign_S (w3[1], w3[2], offset); w3[2] = amd_bytealign_S (w3[0], w3[1], offset); w3[1] = amd_bytealign_S (w2[3], w3[0], offset); w3[0] = amd_bytealign_S (w2[2], w2[3], offset); w2[3] = amd_bytealign_S 
(w2[1], w2[2], offset); w2[2] = amd_bytealign_S (w2[0], w2[1], offset); w2[1] = amd_bytealign_S (w1[3], w2[0], offset); w2[0] = amd_bytealign_S (w1[2], w1[3], offset); w1[3] = amd_bytealign_S (w1[1], w1[2], offset); w1[2] = amd_bytealign_S (w1[0], w1[1], offset); w1[1] = amd_bytealign_S (w0[3], w1[0], offset); w1[0] = amd_bytealign_S (w0[2], w0[3], offset); w0[3] = amd_bytealign_S (w0[1], w0[2], offset); w0[2] = amd_bytealign_S (w0[0], w0[1], offset); w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: w7[3] = amd_bytealign_S (w7[0], w7[1], offset); w7[2] = amd_bytealign_S (w6[3], w7[0], offset); w7[1] = amd_bytealign_S (w6[2], w6[3], offset); w7[0] = amd_bytealign_S (w6[1], w6[2], offset); w6[3] = amd_bytealign_S (w6[0], w6[1], offset); w6[2] = amd_bytealign_S (w5[3], w6[0], offset); w6[1] = amd_bytealign_S (w5[2], w5[3], offset); w6[0] = amd_bytealign_S (w5[1], w5[2], offset); w5[3] = amd_bytealign_S (w5[0], w5[1], offset); w5[2] = amd_bytealign_S (w4[3], w5[0], offset); w5[1] = amd_bytealign_S (w4[2], w4[3], offset); w5[0] = amd_bytealign_S (w4[1], w4[2], offset); w4[3] = amd_bytealign_S (w4[0], w4[1], offset); w4[2] = amd_bytealign_S (w3[3], w4[0], offset); w4[1] = amd_bytealign_S (w3[2], w3[3], offset); w4[0] = amd_bytealign_S (w3[1], w3[2], offset); w3[3] = amd_bytealign_S (w3[0], w3[1], offset); w3[2] = amd_bytealign_S (w2[3], w3[0], offset); w3[1] = amd_bytealign_S (w2[2], w2[3], offset); w3[0] = amd_bytealign_S (w2[1], w2[2], offset); w2[3] = amd_bytealign_S (w2[0], w2[1], offset); w2[2] = amd_bytealign_S (w1[3], w2[0], offset); w2[1] = amd_bytealign_S (w1[2], w1[3], offset); w2[0] = amd_bytealign_S (w1[1], w1[2], offset); w1[3] = amd_bytealign_S (w1[0], w1[1], offset); w1[2] = amd_bytealign_S (w0[3], w1[0], offset); w1[1] = amd_bytealign_S (w0[2], w0[3], offset); w1[0] = amd_bytealign_S (w0[1], w0[2], offset); w0[3] = amd_bytealign_S (w0[0], w0[1], offset); w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 
3: w7[3] = amd_bytealign_S (w6[3], w7[0], offset); w7[2] = amd_bytealign_S (w6[2], w6[3], offset); w7[1] = amd_bytealign_S (w6[1], w6[2], offset); w7[0] = amd_bytealign_S (w6[0], w6[1], offset); w6[3] = amd_bytealign_S (w5[3], w6[0], offset); w6[2] = amd_bytealign_S (w5[2], w5[3], offset); w6[1] = amd_bytealign_S (w5[1], w5[2], offset); w6[0] = amd_bytealign_S (w5[0], w5[1], offset); w5[3] = amd_bytealign_S (w4[3], w5[0], offset); w5[2] = amd_bytealign_S (w4[2], w4[3], offset); w5[1] = amd_bytealign_S (w4[1], w4[2], offset); w5[0] = amd_bytealign_S (w4[0], w4[1], offset); w4[3] = amd_bytealign_S (w3[3], w4[0], offset); w4[2] = amd_bytealign_S (w3[2], w3[3], offset); w4[1] = amd_bytealign_S (w3[1], w3[2], offset); w4[0] = amd_bytealign_S (w3[0], w3[1], offset); w3[3] = amd_bytealign_S (w2[3], w3[0], offset); w3[2] = amd_bytealign_S (w2[2], w2[3], offset); w3[1] = amd_bytealign_S (w2[1], w2[2], offset); w3[0] = amd_bytealign_S (w2[0], w2[1], offset); w2[3] = amd_bytealign_S (w1[3], w2[0], offset); w2[2] = amd_bytealign_S (w1[2], w1[3], offset); w2[1] = amd_bytealign_S (w1[1], w1[2], offset); w2[0] = amd_bytealign_S (w1[0], w1[1], offset); w1[3] = amd_bytealign_S (w0[3], w1[0], offset); w1[2] = amd_bytealign_S (w0[2], w0[3], offset); w1[1] = amd_bytealign_S (w0[1], w0[2], offset); w1[0] = amd_bytealign_S (w0[0], w0[1], offset); w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w7[3] = amd_bytealign_S (w6[2], w6[3], offset); w7[2] = amd_bytealign_S (w6[1], w6[2], offset); w7[1] = amd_bytealign_S (w6[0], w6[1], offset); w7[0] = amd_bytealign_S (w5[3], w6[0], offset); w6[3] = amd_bytealign_S (w5[2], w5[3], offset); w6[2] = amd_bytealign_S (w5[1], w5[2], offset); w6[1] = amd_bytealign_S (w5[0], w5[1], offset); w6[0] = amd_bytealign_S (w4[3], w5[0], offset); w5[3] = amd_bytealign_S (w4[2], w4[3], offset); w5[2] = amd_bytealign_S (w4[1], w4[2], offset); w5[1] = amd_bytealign_S (w4[0], w4[1], offset); w5[0] = amd_bytealign_S (w3[3], 
w4[0], offset); w4[3] = amd_bytealign_S (w3[2], w3[3], offset); w4[2] = amd_bytealign_S (w3[1], w3[2], offset); w4[1] = amd_bytealign_S (w3[0], w3[1], offset); w4[0] = amd_bytealign_S (w2[3], w3[0], offset); w3[3] = amd_bytealign_S (w2[2], w2[3], offset); w3[2] = amd_bytealign_S (w2[1], w2[2], offset); w3[1] = amd_bytealign_S (w2[0], w2[1], offset); w3[0] = amd_bytealign_S (w1[3], w2[0], offset); w2[3] = amd_bytealign_S (w1[2], w1[3], offset); w2[2] = amd_bytealign_S (w1[1], w1[2], offset); w2[1] = amd_bytealign_S (w1[0], w1[1], offset); w2[0] = amd_bytealign_S (w0[3], w1[0], offset); w1[3] = amd_bytealign_S (w0[2], w0[3], offset); w1[2] = amd_bytealign_S (w0[1], w0[2], offset); w1[1] = amd_bytealign_S (w0[0], w0[1], offset); w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = amd_bytealign_S (w6[1], w6[2], offset); w7[2] = amd_bytealign_S (w6[0], w6[1], offset); w7[1] = amd_bytealign_S (w5[3], w6[0], offset); w7[0] = amd_bytealign_S (w5[2], w5[3], offset); w6[3] = amd_bytealign_S (w5[1], w5[2], offset); w6[2] = amd_bytealign_S (w5[0], w5[1], offset); w6[1] = amd_bytealign_S (w4[3], w5[0], offset); w6[0] = amd_bytealign_S (w4[2], w4[3], offset); w5[3] = amd_bytealign_S (w4[1], w4[2], offset); w5[2] = amd_bytealign_S (w4[0], w4[1], offset); w5[1] = amd_bytealign_S (w3[3], w4[0], offset); w5[0] = amd_bytealign_S (w3[2], w3[3], offset); w4[3] = amd_bytealign_S (w3[1], w3[2], offset); w4[2] = amd_bytealign_S (w3[0], w3[1], offset); w4[1] = amd_bytealign_S (w2[3], w3[0], offset); w4[0] = amd_bytealign_S (w2[2], w2[3], offset); w3[3] = amd_bytealign_S (w2[1], w2[2], offset); w3[2] = amd_bytealign_S (w2[0], w2[1], offset); w3[1] = amd_bytealign_S (w1[3], w2[0], offset); w3[0] = amd_bytealign_S (w1[2], w1[3], offset); w2[3] = amd_bytealign_S (w1[1], w1[2], offset); w2[2] = amd_bytealign_S (w1[0], w1[1], offset); w2[1] = amd_bytealign_S (w0[3], w1[0], offset); w2[0] = amd_bytealign_S (w0[2], w0[3], offset); w1[3] = 
amd_bytealign_S (w0[1], w0[2], offset); w1[2] = amd_bytealign_S (w0[0], w0[1], offset); w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = amd_bytealign_S (w6[0], w6[1], offset); w7[2] = amd_bytealign_S (w5[3], w6[0], offset); w7[1] = amd_bytealign_S (w5[2], w5[3], offset); w7[0] = amd_bytealign_S (w5[1], w5[2], offset); w6[3] = amd_bytealign_S (w5[0], w5[1], offset); w6[2] = amd_bytealign_S (w4[3], w5[0], offset); w6[1] = amd_bytealign_S (w4[2], w4[3], offset); w6[0] = amd_bytealign_S (w4[1], w4[2], offset); w5[3] = amd_bytealign_S (w4[0], w4[1], offset); w5[2] = amd_bytealign_S (w3[3], w4[0], offset); w5[1] = amd_bytealign_S (w3[2], w3[3], offset); w5[0] = amd_bytealign_S (w3[1], w3[2], offset); w4[3] = amd_bytealign_S (w3[0], w3[1], offset); w4[2] = amd_bytealign_S (w2[3], w3[0], offset); w4[1] = amd_bytealign_S (w2[2], w2[3], offset); w4[0] = amd_bytealign_S (w2[1], w2[2], offset); w3[3] = amd_bytealign_S (w2[0], w2[1], offset); w3[2] = amd_bytealign_S (w1[3], w2[0], offset); w3[1] = amd_bytealign_S (w1[2], w1[3], offset); w3[0] = amd_bytealign_S (w1[1], w1[2], offset); w2[3] = amd_bytealign_S (w1[0], w1[1], offset); w2[2] = amd_bytealign_S (w0[3], w1[0], offset); w2[1] = amd_bytealign_S (w0[2], w0[3], offset); w2[0] = amd_bytealign_S (w0[1], w0[2], offset); w1[3] = amd_bytealign_S (w0[0], w0[1], offset); w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = amd_bytealign_S (w5[3], w6[0], offset); w7[2] = amd_bytealign_S (w5[2], w5[3], offset); w7[1] = amd_bytealign_S (w5[1], w5[2], offset); w7[0] = amd_bytealign_S (w5[0], w5[1], offset); w6[3] = amd_bytealign_S (w4[3], w5[0], offset); w6[2] = amd_bytealign_S (w4[2], w4[3], offset); w6[1] = amd_bytealign_S (w4[1], w4[2], offset); w6[0] = amd_bytealign_S (w4[0], w4[1], offset); w5[3] = amd_bytealign_S (w3[3], w4[0], offset); w5[2] = amd_bytealign_S (w3[2], w3[3], 
offset); w5[1] = amd_bytealign_S (w3[1], w3[2], offset); w5[0] = amd_bytealign_S (w3[0], w3[1], offset); w4[3] = amd_bytealign_S (w2[3], w3[0], offset); w4[2] = amd_bytealign_S (w2[2], w2[3], offset); w4[1] = amd_bytealign_S (w2[1], w2[2], offset); w4[0] = amd_bytealign_S (w2[0], w2[1], offset); w3[3] = amd_bytealign_S (w1[3], w2[0], offset); w3[2] = amd_bytealign_S (w1[2], w1[3], offset); w3[1] = amd_bytealign_S (w1[1], w1[2], offset); w3[0] = amd_bytealign_S (w1[0], w1[1], offset); w2[3] = amd_bytealign_S (w0[3], w1[0], offset); w2[2] = amd_bytealign_S (w0[2], w0[3], offset); w2[1] = amd_bytealign_S (w0[1], w0[2], offset); w2[0] = amd_bytealign_S (w0[0], w0[1], offset); w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = amd_bytealign_S (w5[2], w5[3], offset); w7[2] = amd_bytealign_S (w5[1], w5[2], offset); w7[1] = amd_bytealign_S (w5[0], w5[1], offset); w7[0] = amd_bytealign_S (w4[3], w5[0], offset); w6[3] = amd_bytealign_S (w4[2], w4[3], offset); w6[2] = amd_bytealign_S (w4[1], w4[2], offset); w6[1] = amd_bytealign_S (w4[0], w4[1], offset); w6[0] = amd_bytealign_S (w3[3], w4[0], offset); w5[3] = amd_bytealign_S (w3[2], w3[3], offset); w5[2] = amd_bytealign_S (w3[1], w3[2], offset); w5[1] = amd_bytealign_S (w3[0], w3[1], offset); w5[0] = amd_bytealign_S (w2[3], w3[0], offset); w4[3] = amd_bytealign_S (w2[2], w2[3], offset); w4[2] = amd_bytealign_S (w2[1], w2[2], offset); w4[1] = amd_bytealign_S (w2[0], w2[1], offset); w4[0] = amd_bytealign_S (w1[3], w2[0], offset); w3[3] = amd_bytealign_S (w1[2], w1[3], offset); w3[2] = amd_bytealign_S (w1[1], w1[2], offset); w3[1] = amd_bytealign_S (w1[0], w1[1], offset); w3[0] = amd_bytealign_S (w0[3], w1[0], offset); w2[3] = amd_bytealign_S (w0[2], w0[3], offset); w2[2] = amd_bytealign_S (w0[1], w0[2], offset); w2[1] = amd_bytealign_S (w0[0], w0[1], offset); w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; 
w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = amd_bytealign_S (w5[1], w5[2], offset); w7[2] = amd_bytealign_S (w5[0], w5[1], offset); w7[1] = amd_bytealign_S (w4[3], w5[0], offset); w7[0] = amd_bytealign_S (w4[2], w4[3], offset); w6[3] = amd_bytealign_S (w4[1], w4[2], offset); w6[2] = amd_bytealign_S (w4[0], w4[1], offset); w6[1] = amd_bytealign_S (w3[3], w4[0], offset); w6[0] = amd_bytealign_S (w3[2], w3[3], offset); w5[3] = amd_bytealign_S (w3[1], w3[2], offset); w5[2] = amd_bytealign_S (w3[0], w3[1], offset); w5[1] = amd_bytealign_S (w2[3], w3[0], offset); w5[0] = amd_bytealign_S (w2[2], w2[3], offset); w4[3] = amd_bytealign_S (w2[1], w2[2], offset); w4[2] = amd_bytealign_S (w2[0], w2[1], offset); w4[1] = amd_bytealign_S (w1[3], w2[0], offset); w4[0] = amd_bytealign_S (w1[2], w1[3], offset); w3[3] = amd_bytealign_S (w1[1], w1[2], offset); w3[2] = amd_bytealign_S (w1[0], w1[1], offset); w3[1] = amd_bytealign_S (w0[3], w1[0], offset); w3[0] = amd_bytealign_S (w0[2], w0[3], offset); w2[3] = amd_bytealign_S (w0[1], w0[2], offset); w2[2] = amd_bytealign_S (w0[0], w0[1], offset); w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = amd_bytealign_S (w5[0], w5[1], offset); w7[2] = amd_bytealign_S (w4[3], w5[0], offset); w7[1] = amd_bytealign_S (w4[2], w4[3], offset); w7[0] = amd_bytealign_S (w4[1], w4[2], offset); w6[3] = amd_bytealign_S (w4[0], w4[1], offset); w6[2] = amd_bytealign_S (w3[3], w4[0], offset); w6[1] = amd_bytealign_S (w3[2], w3[3], offset); w6[0] = amd_bytealign_S (w3[1], w3[2], offset); w5[3] = amd_bytealign_S (w3[0], w3[1], offset); w5[2] = amd_bytealign_S (w2[3], w3[0], offset); w5[1] = amd_bytealign_S (w2[2], w2[3], offset); w5[0] = amd_bytealign_S (w2[1], w2[2], offset); w4[3] = amd_bytealign_S (w2[0], w2[1], offset); w4[2] = amd_bytealign_S (w1[3], w2[0], offset); w4[1] = amd_bytealign_S (w1[2], w1[3], 
offset); w4[0] = amd_bytealign_S (w1[1], w1[2], offset); w3[3] = amd_bytealign_S (w1[0], w1[1], offset); w3[2] = amd_bytealign_S (w0[3], w1[0], offset); w3[1] = amd_bytealign_S (w0[2], w0[3], offset); w3[0] = amd_bytealign_S (w0[1], w0[2], offset); w2[3] = amd_bytealign_S (w0[0], w0[1], offset); w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = amd_bytealign_S (w4[3], w5[0], offset); w7[2] = amd_bytealign_S (w4[2], w4[3], offset); w7[1] = amd_bytealign_S (w4[1], w4[2], offset); w7[0] = amd_bytealign_S (w4[0], w4[1], offset); w6[3] = amd_bytealign_S (w3[3], w4[0], offset); w6[2] = amd_bytealign_S (w3[2], w3[3], offset); w6[1] = amd_bytealign_S (w3[1], w3[2], offset); w6[0] = amd_bytealign_S (w3[0], w3[1], offset); w5[3] = amd_bytealign_S (w2[3], w3[0], offset); w5[2] = amd_bytealign_S (w2[2], w2[3], offset); w5[1] = amd_bytealign_S (w2[1], w2[2], offset); w5[0] = amd_bytealign_S (w2[0], w2[1], offset); w4[3] = amd_bytealign_S (w1[3], w2[0], offset); w4[2] = amd_bytealign_S (w1[2], w1[3], offset); w4[1] = amd_bytealign_S (w1[1], w1[2], offset); w4[0] = amd_bytealign_S (w1[0], w1[1], offset); w3[3] = amd_bytealign_S (w0[3], w1[0], offset); w3[2] = amd_bytealign_S (w0[2], w0[3], offset); w3[1] = amd_bytealign_S (w0[1], w0[2], offset); w3[0] = amd_bytealign_S (w0[0], w0[1], offset); w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = amd_bytealign_S (w4[2], w4[3], offset); w7[2] = amd_bytealign_S (w4[1], w4[2], offset); w7[1] = amd_bytealign_S (w4[0], w4[1], offset); w7[0] = amd_bytealign_S (w3[3], w4[0], offset); w6[3] = amd_bytealign_S (w3[2], w3[3], offset); w6[2] = amd_bytealign_S (w3[1], w3[2], offset); w6[1] = amd_bytealign_S (w3[0], w3[1], offset); w6[0] = amd_bytealign_S (w2[3], w3[0], offset); w5[3] 
= amd_bytealign_S (w2[2], w2[3], offset); w5[2] = amd_bytealign_S (w2[1], w2[2], offset); w5[1] = amd_bytealign_S (w2[0], w2[1], offset); w5[0] = amd_bytealign_S (w1[3], w2[0], offset); w4[3] = amd_bytealign_S (w1[2], w1[3], offset); w4[2] = amd_bytealign_S (w1[1], w1[2], offset); w4[1] = amd_bytealign_S (w1[0], w1[1], offset); w4[0] = amd_bytealign_S (w0[3], w1[0], offset); w3[3] = amd_bytealign_S (w0[2], w0[3], offset); w3[2] = amd_bytealign_S (w0[1], w0[2], offset); w3[1] = amd_bytealign_S (w0[0], w0[1], offset); w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = amd_bytealign_S (w4[1], w4[2], offset); w7[2] = amd_bytealign_S (w4[0], w4[1], offset); w7[1] = amd_bytealign_S (w3[3], w4[0], offset); w7[0] = amd_bytealign_S (w3[2], w3[3], offset); w6[3] = amd_bytealign_S (w3[1], w3[2], offset); w6[2] = amd_bytealign_S (w3[0], w3[1], offset); w6[1] = amd_bytealign_S (w2[3], w3[0], offset); w6[0] = amd_bytealign_S (w2[2], w2[3], offset); w5[3] = amd_bytealign_S (w2[1], w2[2], offset); w5[2] = amd_bytealign_S (w2[0], w2[1], offset); w5[1] = amd_bytealign_S (w1[3], w2[0], offset); w5[0] = amd_bytealign_S (w1[2], w1[3], offset); w4[3] = amd_bytealign_S (w1[1], w1[2], offset); w4[2] = amd_bytealign_S (w1[0], w1[1], offset); w4[1] = amd_bytealign_S (w0[3], w1[0], offset); w4[0] = amd_bytealign_S (w0[2], w0[3], offset); w3[3] = amd_bytealign_S (w0[1], w0[2], offset); w3[2] = amd_bytealign_S (w0[0], w0[1], offset); w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = amd_bytealign_S (w4[0], w4[1], offset); w7[2] = amd_bytealign_S (w3[3], w4[0], offset); w7[1] = amd_bytealign_S (w3[2], w3[3], offset); w7[0] = amd_bytealign_S (w3[1], w3[2], offset); w6[3] = amd_bytealign_S 
(w3[0], w3[1], offset); w6[2] = amd_bytealign_S (w2[3], w3[0], offset); w6[1] = amd_bytealign_S (w2[2], w2[3], offset); w6[0] = amd_bytealign_S (w2[1], w2[2], offset); w5[3] = amd_bytealign_S (w2[0], w2[1], offset); w5[2] = amd_bytealign_S (w1[3], w2[0], offset); w5[1] = amd_bytealign_S (w1[2], w1[3], offset); w5[0] = amd_bytealign_S (w1[1], w1[2], offset); w4[3] = amd_bytealign_S (w1[0], w1[1], offset); w4[2] = amd_bytealign_S (w0[3], w1[0], offset); w4[1] = amd_bytealign_S (w0[2], w0[3], offset); w4[0] = amd_bytealign_S (w0[1], w0[2], offset); w3[3] = amd_bytealign_S (w0[0], w0[1], offset); w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = amd_bytealign_S (w3[3], w4[0], offset); w7[2] = amd_bytealign_S (w3[2], w3[3], offset); w7[1] = amd_bytealign_S (w3[1], w3[2], offset); w7[0] = amd_bytealign_S (w3[0], w3[1], offset); w6[3] = amd_bytealign_S (w2[3], w3[0], offset); w6[2] = amd_bytealign_S (w2[2], w2[3], offset); w6[1] = amd_bytealign_S (w2[1], w2[2], offset); w6[0] = amd_bytealign_S (w2[0], w2[1], offset); w5[3] = amd_bytealign_S (w1[3], w2[0], offset); w5[2] = amd_bytealign_S (w1[2], w1[3], offset); w5[1] = amd_bytealign_S (w1[1], w1[2], offset); w5[0] = amd_bytealign_S (w1[0], w1[1], offset); w4[3] = amd_bytealign_S (w0[3], w1[0], offset); w4[2] = amd_bytealign_S (w0[2], w0[3], offset); w4[1] = amd_bytealign_S (w0[1], w0[2], offset); w4[0] = amd_bytealign_S (w0[0], w0[1], offset); w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: w7[3] = amd_bytealign_S (w3[2], w3[3], offset); w7[2] = amd_bytealign_S (w3[1], w3[2], offset); w7[1] = amd_bytealign_S (w3[0], w3[1], offset); w7[0] = amd_bytealign_S (w2[3], w3[0], 
offset); w6[3] = amd_bytealign_S (w2[2], w2[3], offset); w6[2] = amd_bytealign_S (w2[1], w2[2], offset); w6[1] = amd_bytealign_S (w2[0], w2[1], offset); w6[0] = amd_bytealign_S (w1[3], w2[0], offset); w5[3] = amd_bytealign_S (w1[2], w1[3], offset); w5[2] = amd_bytealign_S (w1[1], w1[2], offset); w5[1] = amd_bytealign_S (w1[0], w1[1], offset); w5[0] = amd_bytealign_S (w0[3], w1[0], offset); w4[3] = amd_bytealign_S (w0[2], w0[3], offset); w4[2] = amd_bytealign_S (w0[1], w0[2], offset); w4[1] = amd_bytealign_S (w0[0], w0[1], offset); w4[0] = amd_bytealign_S ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: w7[3] = amd_bytealign_S (w3[1], w3[2], offset); w7[2] = amd_bytealign_S (w3[0], w3[1], offset); w7[1] = amd_bytealign_S (w2[3], w3[0], offset); w7[0] = amd_bytealign_S (w2[2], w2[3], offset); w6[3] = amd_bytealign_S (w2[1], w2[2], offset); w6[2] = amd_bytealign_S (w2[0], w2[1], offset); w6[1] = amd_bytealign_S (w1[3], w2[0], offset); w6[0] = amd_bytealign_S (w1[2], w1[3], offset); w5[3] = amd_bytealign_S (w1[1], w1[2], offset); w5[2] = amd_bytealign_S (w1[0], w1[1], offset); w5[1] = amd_bytealign_S (w0[3], w1[0], offset); w5[0] = amd_bytealign_S (w0[2], w0[3], offset); w4[3] = amd_bytealign_S (w0[1], w0[2], offset); w4[2] = amd_bytealign_S (w0[0], w0[1], offset); w4[1] = amd_bytealign_S ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: w7[3] = amd_bytealign_S (w3[0], w3[1], offset); w7[2] = amd_bytealign_S (w2[3], w3[0], offset); w7[1] = amd_bytealign_S (w2[2], w2[3], offset); w7[0] = amd_bytealign_S (w2[1], w2[2], offset); w6[3] = amd_bytealign_S (w2[0], w2[1], offset); w6[2] = amd_bytealign_S (w1[3], w2[0], offset); w6[1] = 
amd_bytealign_S (w1[2], w1[3], offset); w6[0] = amd_bytealign_S (w1[1], w1[2], offset); w5[3] = amd_bytealign_S (w1[0], w1[1], offset); w5[2] = amd_bytealign_S (w0[3], w1[0], offset); w5[1] = amd_bytealign_S (w0[2], w0[3], offset); w5[0] = amd_bytealign_S (w0[1], w0[2], offset); w4[3] = amd_bytealign_S (w0[0], w0[1], offset); w4[2] = amd_bytealign_S ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: w7[3] = amd_bytealign_S (w2[3], w3[0], offset); w7[2] = amd_bytealign_S (w2[2], w2[3], offset); w7[1] = amd_bytealign_S (w2[1], w2[2], offset); w7[0] = amd_bytealign_S (w2[0], w2[1], offset); w6[3] = amd_bytealign_S (w1[3], w2[0], offset); w6[2] = amd_bytealign_S (w1[2], w1[3], offset); w6[1] = amd_bytealign_S (w1[1], w1[2], offset); w6[0] = amd_bytealign_S (w1[0], w1[1], offset); w5[3] = amd_bytealign_S (w0[3], w1[0], offset); w5[2] = amd_bytealign_S (w0[2], w0[3], offset); w5[1] = amd_bytealign_S (w0[1], w0[2], offset); w5[0] = amd_bytealign_S (w0[0], w0[1], offset); w4[3] = amd_bytealign_S ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: w7[3] = amd_bytealign_S (w2[2], w2[3], offset); w7[2] = amd_bytealign_S (w2[1], w2[2], offset); w7[1] = amd_bytealign_S (w2[0], w2[1], offset); w7[0] = amd_bytealign_S (w1[3], w2[0], offset); w6[3] = amd_bytealign_S (w1[2], w1[3], offset); w6[2] = amd_bytealign_S (w1[1], w1[2], offset); w6[1] = amd_bytealign_S (w1[0], w1[1], offset); w6[0] = amd_bytealign_S (w0[3], w1[0], offset); w5[3] = amd_bytealign_S (w0[2], w0[3], offset); w5[2] = amd_bytealign_S (w0[1], w0[2], offset); w5[1] = amd_bytealign_S (w0[0], w0[1], offset); w5[0] = amd_bytealign_S ( 0, w0[0], 
offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: w7[3] = amd_bytealign_S (w2[1], w2[2], offset); w7[2] = amd_bytealign_S (w2[0], w2[1], offset); w7[1] = amd_bytealign_S (w1[3], w2[0], offset); w7[0] = amd_bytealign_S (w1[2], w1[3], offset); w6[3] = amd_bytealign_S (w1[1], w1[2], offset); w6[2] = amd_bytealign_S (w1[0], w1[1], offset); w6[1] = amd_bytealign_S (w0[3], w1[0], offset); w6[0] = amd_bytealign_S (w0[2], w0[3], offset); w5[3] = amd_bytealign_S (w0[1], w0[2], offset); w5[2] = amd_bytealign_S (w0[0], w0[1], offset); w5[1] = amd_bytealign_S ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: w7[3] = amd_bytealign_S (w2[0], w2[1], offset); w7[2] = amd_bytealign_S (w1[3], w2[0], offset); w7[1] = amd_bytealign_S (w1[2], w1[3], offset); w7[0] = amd_bytealign_S (w1[1], w1[2], offset); w6[3] = amd_bytealign_S (w1[0], w1[1], offset); w6[2] = amd_bytealign_S (w0[3], w1[0], offset); w6[1] = amd_bytealign_S (w0[2], w0[3], offset); w6[0] = amd_bytealign_S (w0[1], w0[2], offset); w5[3] = amd_bytealign_S (w0[0], w0[1], offset); w5[2] = amd_bytealign_S ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: w7[3] = amd_bytealign_S (w1[3], w2[0], offset); w7[2] = amd_bytealign_S (w1[2], w1[3], offset); w7[1] = amd_bytealign_S (w1[1], w1[2], offset); w7[0] = amd_bytealign_S (w1[0], w1[1], offset); w6[3] = amd_bytealign_S (w0[3], w1[0], offset); w6[2] = 
amd_bytealign_S (w0[2], w0[3], offset); w6[1] = amd_bytealign_S (w0[1], w0[2], offset); w6[0] = amd_bytealign_S (w0[0], w0[1], offset); w5[3] = amd_bytealign_S ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: w7[3] = amd_bytealign_S (w1[2], w1[3], offset); w7[2] = amd_bytealign_S (w1[1], w1[2], offset); w7[1] = amd_bytealign_S (w1[0], w1[1], offset); w7[0] = amd_bytealign_S (w0[3], w1[0], offset); w6[3] = amd_bytealign_S (w0[2], w0[3], offset); w6[2] = amd_bytealign_S (w0[1], w0[2], offset); w6[1] = amd_bytealign_S (w0[0], w0[1], offset); w6[0] = amd_bytealign_S ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: w7[3] = amd_bytealign_S (w1[1], w1[2], offset); w7[2] = amd_bytealign_S (w1[0], w1[1], offset); w7[1] = amd_bytealign_S (w0[3], w1[0], offset); w7[0] = amd_bytealign_S (w0[2], w0[3], offset); w6[3] = amd_bytealign_S (w0[1], w0[2], offset); w6[2] = amd_bytealign_S (w0[0], w0[1], offset); w6[1] = amd_bytealign_S ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: w7[3] = amd_bytealign_S (w1[0], w1[1], offset); w7[2] = amd_bytealign_S (w0[3], w1[0], offset); w7[1] = amd_bytealign_S (w0[2], w0[3], offset); w7[0] = amd_bytealign_S (w0[1], w0[2], offset); w6[3] = amd_bytealign_S (w0[0], w0[1], offset); w6[2] = amd_bytealign_S 
( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: w7[3] = amd_bytealign_S (w0[3], w1[0], offset); w7[2] = amd_bytealign_S (w0[2], w0[3], offset); w7[1] = amd_bytealign_S (w0[1], w0[2], offset); w7[0] = amd_bytealign_S (w0[0], w0[1], offset); w6[3] = amd_bytealign_S ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: w7[3] = amd_bytealign_S (w0[2], w0[3], offset); w7[2] = amd_bytealign_S (w0[1], w0[2], offset); w7[1] = amd_bytealign_S (w0[0], w0[1], offset); w7[0] = amd_bytealign_S ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: w7[3] = amd_bytealign_S (w0[1], w0[2], offset); w7[2] = amd_bytealign_S (w0[0], w0[1], offset); w7[1] = amd_bytealign_S ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: w7[3] = amd_bytealign_S (w0[0], w0[1], offset); w7[2] = amd_bytealign_S ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; 
w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: w7[3] = amd_bytealign_S ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } w0[0] = swap32_S (w0[0]); w0[1] = swap32_S (w0[1]); w0[2] = swap32_S (w0[2]); w0[3] = swap32_S (w0[3]); w1[0] = swap32_S (w1[0]); w1[1] = swap32_S (w1[1]); w1[2] = swap32_S (w1[2]); w1[3] = swap32_S (w1[3]); w2[0] = swap32_S (w2[0]); w2[1] = swap32_S (w2[1]); w2[2] = swap32_S (w2[2]); w2[3] = swap32_S (w2[3]); w3[0] = swap32_S (w3[0]); w3[1] = swap32_S (w3[1]); w3[2] = swap32_S (w3[2]); w3[3] = swap32_S (w3[3]); w4[0] = swap32_S (w4[0]); w4[1] = swap32_S (w4[1]); w4[2] = swap32_S (w4[2]); w4[3] = swap32_S (w4[3]); w5[0] = swap32_S (w5[0]); w5[1] = swap32_S (w5[1]); w5[2] = swap32_S (w5[2]); w5[3] = swap32_S (w5[3]); w6[0] = swap32_S (w6[0]); w6[1] = swap32_S (w6[1]); w6[2] = swap32_S (w6[2]); w6[3] = swap32_S (w6[3]); w7[0] = swap32_S (w7[0]); w7[1] = swap32_S (w7[1]); w7[2] = swap32_S (w7[2]); w7[3] = swap32_S (w7[3]); #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif switch (offset_switch) { case 0: w7[3] = __byte_perm_S (w7[2], w7[3], selector); w7[2] = __byte_perm_S (w7[1], w7[2], selector); w7[1] = __byte_perm_S (w7[0], w7[1], selector); w7[0] = __byte_perm_S (w6[3], w7[0], selector); 
w6[3] = __byte_perm_S (w6[2], w6[3], selector); w6[2] = __byte_perm_S (w6[1], w6[2], selector); w6[1] = __byte_perm_S (w6[0], w6[1], selector); w6[0] = __byte_perm_S (w5[3], w6[0], selector); w5[3] = __byte_perm_S (w5[2], w5[3], selector); w5[2] = __byte_perm_S (w5[1], w5[2], selector); w5[1] = __byte_perm_S (w5[0], w5[1], selector); w5[0] = __byte_perm_S (w4[3], w5[0], selector); w4[3] = __byte_perm_S (w4[2], w4[3], selector); w4[2] = __byte_perm_S (w4[1], w4[2], selector); w4[1] = __byte_perm_S (w4[0], w4[1], selector); w4[0] = __byte_perm_S (w3[3], w4[0], selector); w3[3] = __byte_perm_S (w3[2], w3[3], selector); w3[2] = __byte_perm_S (w3[1], w3[2], selector); w3[1] = __byte_perm_S (w3[0], w3[1], selector); w3[0] = __byte_perm_S (w2[3], w3[0], selector); w2[3] = __byte_perm_S (w2[2], w2[3], selector); w2[2] = __byte_perm_S (w2[1], w2[2], selector); w2[1] = __byte_perm_S (w2[0], w2[1], selector); w2[0] = __byte_perm_S (w1[3], w2[0], selector); w1[3] = __byte_perm_S (w1[2], w1[3], selector); w1[2] = __byte_perm_S (w1[1], w1[2], selector); w1[1] = __byte_perm_S (w1[0], w1[1], selector); w1[0] = __byte_perm_S (w0[3], w1[0], selector); w0[3] = __byte_perm_S (w0[2], w0[3], selector); w0[2] = __byte_perm_S (w0[1], w0[2], selector); w0[1] = __byte_perm_S (w0[0], w0[1], selector); w0[0] = __byte_perm_S ( 0, w0[0], selector); break; case 1: w7[3] = __byte_perm_S (w7[1], w7[2], selector); w7[2] = __byte_perm_S (w7[0], w7[1], selector); w7[1] = __byte_perm_S (w6[3], w7[0], selector); w7[0] = __byte_perm_S (w6[2], w6[3], selector); w6[3] = __byte_perm_S (w6[1], w6[2], selector); w6[2] = __byte_perm_S (w6[0], w6[1], selector); w6[1] = __byte_perm_S (w5[3], w6[0], selector); w6[0] = __byte_perm_S (w5[2], w5[3], selector); w5[3] = __byte_perm_S (w5[1], w5[2], selector); w5[2] = __byte_perm_S (w5[0], w5[1], selector); w5[1] = __byte_perm_S (w4[3], w5[0], selector); w5[0] = __byte_perm_S (w4[2], w4[3], selector); w4[3] = __byte_perm_S (w4[1], w4[2], selector); w4[2] = 
__byte_perm_S (w4[0], w4[1], selector); w4[1] = __byte_perm_S (w3[3], w4[0], selector); w4[0] = __byte_perm_S (w3[2], w3[3], selector); w3[3] = __byte_perm_S (w3[1], w3[2], selector); w3[2] = __byte_perm_S (w3[0], w3[1], selector); w3[1] = __byte_perm_S (w2[3], w3[0], selector); w3[0] = __byte_perm_S (w2[2], w2[3], selector); w2[3] = __byte_perm_S (w2[1], w2[2], selector); w2[2] = __byte_perm_S (w2[0], w2[1], selector); w2[1] = __byte_perm_S (w1[3], w2[0], selector); w2[0] = __byte_perm_S (w1[2], w1[3], selector); w1[3] = __byte_perm_S (w1[1], w1[2], selector); w1[2] = __byte_perm_S (w1[0], w1[1], selector); w1[1] = __byte_perm_S (w0[3], w1[0], selector); w1[0] = __byte_perm_S (w0[2], w0[3], selector); w0[3] = __byte_perm_S (w0[1], w0[2], selector); w0[2] = __byte_perm_S (w0[0], w0[1], selector); w0[1] = __byte_perm_S ( 0, w0[0], selector); w0[0] = 0; break; case 2: w7[3] = __byte_perm_S (w7[0], w7[1], selector); w7[2] = __byte_perm_S (w6[3], w7[0], selector); w7[1] = __byte_perm_S (w6[2], w6[3], selector); w7[0] = __byte_perm_S (w6[1], w6[2], selector); w6[3] = __byte_perm_S (w6[0], w6[1], selector); w6[2] = __byte_perm_S (w5[3], w6[0], selector); w6[1] = __byte_perm_S (w5[2], w5[3], selector); w6[0] = __byte_perm_S (w5[1], w5[2], selector); w5[3] = __byte_perm_S (w5[0], w5[1], selector); w5[2] = __byte_perm_S (w4[3], w5[0], selector); w5[1] = __byte_perm_S (w4[2], w4[3], selector); w5[0] = __byte_perm_S (w4[1], w4[2], selector); w4[3] = __byte_perm_S (w4[0], w4[1], selector); w4[2] = __byte_perm_S (w3[3], w4[0], selector); w4[1] = __byte_perm_S (w3[2], w3[3], selector); w4[0] = __byte_perm_S (w3[1], w3[2], selector); w3[3] = __byte_perm_S (w3[0], w3[1], selector); w3[2] = __byte_perm_S (w2[3], w3[0], selector); w3[1] = __byte_perm_S (w2[2], w2[3], selector); w3[0] = __byte_perm_S (w2[1], w2[2], selector); w2[3] = __byte_perm_S (w2[0], w2[1], selector); w2[2] = __byte_perm_S (w1[3], w2[0], selector); w2[1] = __byte_perm_S (w1[2], w1[3], selector); w2[0] = 
__byte_perm_S (w1[1], w1[2], selector); w1[3] = __byte_perm_S (w1[0], w1[1], selector); w1[2] = __byte_perm_S (w0[3], w1[0], selector); w1[1] = __byte_perm_S (w0[2], w0[3], selector); w1[0] = __byte_perm_S (w0[1], w0[2], selector); w0[3] = __byte_perm_S (w0[0], w0[1], selector); w0[2] = __byte_perm_S ( 0, w0[0], selector); w0[1] = 0; w0[0] = 0; break; case 3: w7[3] = __byte_perm_S (w6[3], w7[0], selector); w7[2] = __byte_perm_S (w6[2], w6[3], selector); w7[1] = __byte_perm_S (w6[1], w6[2], selector); w7[0] = __byte_perm_S (w6[0], w6[1], selector); w6[3] = __byte_perm_S (w5[3], w6[0], selector); w6[2] = __byte_perm_S (w5[2], w5[3], selector); w6[1] = __byte_perm_S (w5[1], w5[2], selector); w6[0] = __byte_perm_S (w5[0], w5[1], selector); w5[3] = __byte_perm_S (w4[3], w5[0], selector); w5[2] = __byte_perm_S (w4[2], w4[3], selector); w5[1] = __byte_perm_S (w4[1], w4[2], selector); w5[0] = __byte_perm_S (w4[0], w4[1], selector); w4[3] = __byte_perm_S (w3[3], w4[0], selector); w4[2] = __byte_perm_S (w3[2], w3[3], selector); w4[1] = __byte_perm_S (w3[1], w3[2], selector); w4[0] = __byte_perm_S (w3[0], w3[1], selector); w3[3] = __byte_perm_S (w2[3], w3[0], selector); w3[2] = __byte_perm_S (w2[2], w2[3], selector); w3[1] = __byte_perm_S (w2[1], w2[2], selector); w3[0] = __byte_perm_S (w2[0], w2[1], selector); w2[3] = __byte_perm_S (w1[3], w2[0], selector); w2[2] = __byte_perm_S (w1[2], w1[3], selector); w2[1] = __byte_perm_S (w1[1], w1[2], selector); w2[0] = __byte_perm_S (w1[0], w1[1], selector); w1[3] = __byte_perm_S (w0[3], w1[0], selector); w1[2] = __byte_perm_S (w0[2], w0[3], selector); w1[1] = __byte_perm_S (w0[1], w0[2], selector); w1[0] = __byte_perm_S (w0[0], w0[1], selector); w0[3] = __byte_perm_S ( 0, w0[0], selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w7[3] = __byte_perm_S (w6[2], w6[3], selector); w7[2] = __byte_perm_S (w6[1], w6[2], selector); w7[1] = __byte_perm_S (w6[0], w6[1], selector); w7[0] = __byte_perm_S (w5[3], w6[0], selector); w6[3] = 
__byte_perm_S (w5[2], w5[3], selector); w6[2] = __byte_perm_S (w5[1], w5[2], selector); w6[1] = __byte_perm_S (w5[0], w5[1], selector); w6[0] = __byte_perm_S (w4[3], w5[0], selector); w5[3] = __byte_perm_S (w4[2], w4[3], selector); w5[2] = __byte_perm_S (w4[1], w4[2], selector); w5[1] = __byte_perm_S (w4[0], w4[1], selector); w5[0] = __byte_perm_S (w3[3], w4[0], selector); w4[3] = __byte_perm_S (w3[2], w3[3], selector); w4[2] = __byte_perm_S (w3[1], w3[2], selector); w4[1] = __byte_perm_S (w3[0], w3[1], selector); w4[0] = __byte_perm_S (w2[3], w3[0], selector); w3[3] = __byte_perm_S (w2[2], w2[3], selector); w3[2] = __byte_perm_S (w2[1], w2[2], selector); w3[1] = __byte_perm_S (w2[0], w2[1], selector); w3[0] = __byte_perm_S (w1[3], w2[0], selector); w2[3] = __byte_perm_S (w1[2], w1[3], selector); w2[2] = __byte_perm_S (w1[1], w1[2], selector); w2[1] = __byte_perm_S (w1[0], w1[1], selector); w2[0] = __byte_perm_S (w0[3], w1[0], selector); w1[3] = __byte_perm_S (w0[2], w0[3], selector); w1[2] = __byte_perm_S (w0[1], w0[2], selector); w1[1] = __byte_perm_S (w0[0], w0[1], selector); w1[0] = __byte_perm_S ( 0, w0[0], selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = __byte_perm_S (w6[1], w6[2], selector); w7[2] = __byte_perm_S (w6[0], w6[1], selector); w7[1] = __byte_perm_S (w5[3], w6[0], selector); w7[0] = __byte_perm_S (w5[2], w5[3], selector); w6[3] = __byte_perm_S (w5[1], w5[2], selector); w6[2] = __byte_perm_S (w5[0], w5[1], selector); w6[1] = __byte_perm_S (w4[3], w5[0], selector); w6[0] = __byte_perm_S (w4[2], w4[3], selector); w5[3] = __byte_perm_S (w4[1], w4[2], selector); w5[2] = __byte_perm_S (w4[0], w4[1], selector); w5[1] = __byte_perm_S (w3[3], w4[0], selector); w5[0] = __byte_perm_S (w3[2], w3[3], selector); w4[3] = __byte_perm_S (w3[1], w3[2], selector); w4[2] = __byte_perm_S (w3[0], w3[1], selector); w4[1] = __byte_perm_S (w2[3], w3[0], selector); w4[0] = __byte_perm_S (w2[2], w2[3], selector); w3[3] = __byte_perm_S (w2[1], 
w2[2], selector); w3[2] = __byte_perm_S (w2[0], w2[1], selector); w3[1] = __byte_perm_S (w1[3], w2[0], selector); w3[0] = __byte_perm_S (w1[2], w1[3], selector); w2[3] = __byte_perm_S (w1[1], w1[2], selector); w2[2] = __byte_perm_S (w1[0], w1[1], selector); w2[1] = __byte_perm_S (w0[3], w1[0], selector); w2[0] = __byte_perm_S (w0[2], w0[3], selector); w1[3] = __byte_perm_S (w0[1], w0[2], selector); w1[2] = __byte_perm_S (w0[0], w0[1], selector); w1[1] = __byte_perm_S ( 0, w0[0], selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = __byte_perm_S (w6[0], w6[1], selector); w7[2] = __byte_perm_S (w5[3], w6[0], selector); w7[1] = __byte_perm_S (w5[2], w5[3], selector); w7[0] = __byte_perm_S (w5[1], w5[2], selector); w6[3] = __byte_perm_S (w5[0], w5[1], selector); w6[2] = __byte_perm_S (w4[3], w5[0], selector); w6[1] = __byte_perm_S (w4[2], w4[3], selector); w6[0] = __byte_perm_S (w4[1], w4[2], selector); w5[3] = __byte_perm_S (w4[0], w4[1], selector); w5[2] = __byte_perm_S (w3[3], w4[0], selector); w5[1] = __byte_perm_S (w3[2], w3[3], selector); w5[0] = __byte_perm_S (w3[1], w3[2], selector); w4[3] = __byte_perm_S (w3[0], w3[1], selector); w4[2] = __byte_perm_S (w2[3], w3[0], selector); w4[1] = __byte_perm_S (w2[2], w2[3], selector); w4[0] = __byte_perm_S (w2[1], w2[2], selector); w3[3] = __byte_perm_S (w2[0], w2[1], selector); w3[2] = __byte_perm_S (w1[3], w2[0], selector); w3[1] = __byte_perm_S (w1[2], w1[3], selector); w3[0] = __byte_perm_S (w1[1], w1[2], selector); w2[3] = __byte_perm_S (w1[0], w1[1], selector); w2[2] = __byte_perm_S (w0[3], w1[0], selector); w2[1] = __byte_perm_S (w0[2], w0[3], selector); w2[0] = __byte_perm_S (w0[1], w0[2], selector); w1[3] = __byte_perm_S (w0[0], w0[1], selector); w1[2] = __byte_perm_S ( 0, w0[0], selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = __byte_perm_S (w5[3], w6[0], selector); w7[2] = __byte_perm_S (w5[2], w5[3], selector); w7[1] = 
__byte_perm_S (w5[1], w5[2], selector); w7[0] = __byte_perm_S (w5[0], w5[1], selector); w6[3] = __byte_perm_S (w4[3], w5[0], selector); w6[2] = __byte_perm_S (w4[2], w4[3], selector); w6[1] = __byte_perm_S (w4[1], w4[2], selector); w6[0] = __byte_perm_S (w4[0], w4[1], selector); w5[3] = __byte_perm_S (w3[3], w4[0], selector); w5[2] = __byte_perm_S (w3[2], w3[3], selector); w5[1] = __byte_perm_S (w3[1], w3[2], selector); w5[0] = __byte_perm_S (w3[0], w3[1], selector); w4[3] = __byte_perm_S (w2[3], w3[0], selector); w4[2] = __byte_perm_S (w2[2], w2[3], selector); w4[1] = __byte_perm_S (w2[1], w2[2], selector); w4[0] = __byte_perm_S (w2[0], w2[1], selector); w3[3] = __byte_perm_S (w1[3], w2[0], selector); w3[2] = __byte_perm_S (w1[2], w1[3], selector); w3[1] = __byte_perm_S (w1[1], w1[2], selector); w3[0] = __byte_perm_S (w1[0], w1[1], selector); w2[3] = __byte_perm_S (w0[3], w1[0], selector); w2[2] = __byte_perm_S (w0[2], w0[3], selector); w2[1] = __byte_perm_S (w0[1], w0[2], selector); w2[0] = __byte_perm_S (w0[0], w0[1], selector); w1[3] = __byte_perm_S ( 0, w0[0], selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = __byte_perm_S (w5[2], w5[3], selector); w7[2] = __byte_perm_S (w5[1], w5[2], selector); w7[1] = __byte_perm_S (w5[0], w5[1], selector); w7[0] = __byte_perm_S (w4[3], w5[0], selector); w6[3] = __byte_perm_S (w4[2], w4[3], selector); w6[2] = __byte_perm_S (w4[1], w4[2], selector); w6[1] = __byte_perm_S (w4[0], w4[1], selector); w6[0] = __byte_perm_S (w3[3], w4[0], selector); w5[3] = __byte_perm_S (w3[2], w3[3], selector); w5[2] = __byte_perm_S (w3[1], w3[2], selector); w5[1] = __byte_perm_S (w3[0], w3[1], selector); w5[0] = __byte_perm_S (w2[3], w3[0], selector); w4[3] = __byte_perm_S (w2[2], w2[3], selector); w4[2] = __byte_perm_S (w2[1], w2[2], selector); w4[1] = __byte_perm_S (w2[0], w2[1], selector); w4[0] = __byte_perm_S (w1[3], w2[0], selector); w3[3] = __byte_perm_S (w1[2], w1[3], 
selector); w3[2] = __byte_perm_S (w1[1], w1[2], selector); w3[1] = __byte_perm_S (w1[0], w1[1], selector); w3[0] = __byte_perm_S (w0[3], w1[0], selector); w2[3] = __byte_perm_S (w0[2], w0[3], selector); w2[2] = __byte_perm_S (w0[1], w0[2], selector); w2[1] = __byte_perm_S (w0[0], w0[1], selector); w2[0] = __byte_perm_S ( 0, w0[0], selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = __byte_perm_S (w5[1], w5[2], selector); w7[2] = __byte_perm_S (w5[0], w5[1], selector); w7[1] = __byte_perm_S (w4[3], w5[0], selector); w7[0] = __byte_perm_S (w4[2], w4[3], selector); w6[3] = __byte_perm_S (w4[1], w4[2], selector); w6[2] = __byte_perm_S (w4[0], w4[1], selector); w6[1] = __byte_perm_S (w3[3], w4[0], selector); w6[0] = __byte_perm_S (w3[2], w3[3], selector); w5[3] = __byte_perm_S (w3[1], w3[2], selector); w5[2] = __byte_perm_S (w3[0], w3[1], selector); w5[1] = __byte_perm_S (w2[3], w3[0], selector); w5[0] = __byte_perm_S (w2[2], w2[3], selector); w4[3] = __byte_perm_S (w2[1], w2[2], selector); w4[2] = __byte_perm_S (w2[0], w2[1], selector); w4[1] = __byte_perm_S (w1[3], w2[0], selector); w4[0] = __byte_perm_S (w1[2], w1[3], selector); w3[3] = __byte_perm_S (w1[1], w1[2], selector); w3[2] = __byte_perm_S (w1[0], w1[1], selector); w3[1] = __byte_perm_S (w0[3], w1[0], selector); w3[0] = __byte_perm_S (w0[2], w0[3], selector); w2[3] = __byte_perm_S (w0[1], w0[2], selector); w2[2] = __byte_perm_S (w0[0], w0[1], selector); w2[1] = __byte_perm_S ( 0, w0[0], selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = __byte_perm_S (w5[0], w5[1], selector); w7[2] = __byte_perm_S (w4[3], w5[0], selector); w7[1] = __byte_perm_S (w4[2], w4[3], selector); w7[0] = __byte_perm_S (w4[1], w4[2], selector); w6[3] = __byte_perm_S (w4[0], w4[1], selector); w6[2] = __byte_perm_S (w3[3], w4[0], selector); w6[1] = __byte_perm_S (w3[2], w3[3], selector); 
w6[0] = __byte_perm_S (w3[1], w3[2], selector); w5[3] = __byte_perm_S (w3[0], w3[1], selector); w5[2] = __byte_perm_S (w2[3], w3[0], selector); w5[1] = __byte_perm_S (w2[2], w2[3], selector); w5[0] = __byte_perm_S (w2[1], w2[2], selector); w4[3] = __byte_perm_S (w2[0], w2[1], selector); w4[2] = __byte_perm_S (w1[3], w2[0], selector); w4[1] = __byte_perm_S (w1[2], w1[3], selector); w4[0] = __byte_perm_S (w1[1], w1[2], selector); w3[3] = __byte_perm_S (w1[0], w1[1], selector); w3[2] = __byte_perm_S (w0[3], w1[0], selector); w3[1] = __byte_perm_S (w0[2], w0[3], selector); w3[0] = __byte_perm_S (w0[1], w0[2], selector); w2[3] = __byte_perm_S (w0[0], w0[1], selector); w2[2] = __byte_perm_S ( 0, w0[0], selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = __byte_perm_S (w4[3], w5[0], selector); w7[2] = __byte_perm_S (w4[2], w4[3], selector); w7[1] = __byte_perm_S (w4[1], w4[2], selector); w7[0] = __byte_perm_S (w4[0], w4[1], selector); w6[3] = __byte_perm_S (w3[3], w4[0], selector); w6[2] = __byte_perm_S (w3[2], w3[3], selector); w6[1] = __byte_perm_S (w3[1], w3[2], selector); w6[0] = __byte_perm_S (w3[0], w3[1], selector); w5[3] = __byte_perm_S (w2[3], w3[0], selector); w5[2] = __byte_perm_S (w2[2], w2[3], selector); w5[1] = __byte_perm_S (w2[1], w2[2], selector); w5[0] = __byte_perm_S (w2[0], w2[1], selector); w4[3] = __byte_perm_S (w1[3], w2[0], selector); w4[2] = __byte_perm_S (w1[2], w1[3], selector); w4[1] = __byte_perm_S (w1[1], w1[2], selector); w4[0] = __byte_perm_S (w1[0], w1[1], selector); w3[3] = __byte_perm_S (w0[3], w1[0], selector); w3[2] = __byte_perm_S (w0[2], w0[3], selector); w3[1] = __byte_perm_S (w0[1], w0[2], selector); w3[0] = __byte_perm_S (w0[0], w0[1], selector); w2[3] = __byte_perm_S ( 0, w0[0], selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = 
__byte_perm_S (w4[2], w4[3], selector); w7[2] = __byte_perm_S (w4[1], w4[2], selector); w7[1] = __byte_perm_S (w4[0], w4[1], selector); w7[0] = __byte_perm_S (w3[3], w4[0], selector); w6[3] = __byte_perm_S (w3[2], w3[3], selector); w6[2] = __byte_perm_S (w3[1], w3[2], selector); w6[1] = __byte_perm_S (w3[0], w3[1], selector); w6[0] = __byte_perm_S (w2[3], w3[0], selector); w5[3] = __byte_perm_S (w2[2], w2[3], selector); w5[2] = __byte_perm_S (w2[1], w2[2], selector); w5[1] = __byte_perm_S (w2[0], w2[1], selector); w5[0] = __byte_perm_S (w1[3], w2[0], selector); w4[3] = __byte_perm_S (w1[2], w1[3], selector); w4[2] = __byte_perm_S (w1[1], w1[2], selector); w4[1] = __byte_perm_S (w1[0], w1[1], selector); w4[0] = __byte_perm_S (w0[3], w1[0], selector); w3[3] = __byte_perm_S (w0[2], w0[3], selector); w3[2] = __byte_perm_S (w0[1], w0[2], selector); w3[1] = __byte_perm_S (w0[0], w0[1], selector); w3[0] = __byte_perm_S ( 0, w0[0], selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = __byte_perm_S (w4[1], w4[2], selector); w7[2] = __byte_perm_S (w4[0], w4[1], selector); w7[1] = __byte_perm_S (w3[3], w4[0], selector); w7[0] = __byte_perm_S (w3[2], w3[3], selector); w6[3] = __byte_perm_S (w3[1], w3[2], selector); w6[2] = __byte_perm_S (w3[0], w3[1], selector); w6[1] = __byte_perm_S (w2[3], w3[0], selector); w6[0] = __byte_perm_S (w2[2], w2[3], selector); w5[3] = __byte_perm_S (w2[1], w2[2], selector); w5[2] = __byte_perm_S (w2[0], w2[1], selector); w5[1] = __byte_perm_S (w1[3], w2[0], selector); w5[0] = __byte_perm_S (w1[2], w1[3], selector); w4[3] = __byte_perm_S (w1[1], w1[2], selector); w4[2] = __byte_perm_S (w1[0], w1[1], selector); w4[1] = __byte_perm_S (w0[3], w1[0], selector); w4[0] = __byte_perm_S (w0[2], w0[3], selector); w3[3] = __byte_perm_S (w0[1], w0[2], selector); w3[2] = __byte_perm_S (w0[0], w0[1], selector); w3[1] = __byte_perm_S ( 0, w0[0], 
selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = __byte_perm_S (w4[0], w4[1], selector); w7[2] = __byte_perm_S (w3[3], w4[0], selector); w7[1] = __byte_perm_S (w3[2], w3[3], selector); w7[0] = __byte_perm_S (w3[1], w3[2], selector); w6[3] = __byte_perm_S (w3[0], w3[1], selector); w6[2] = __byte_perm_S (w2[3], w3[0], selector); w6[1] = __byte_perm_S (w2[2], w2[3], selector); w6[0] = __byte_perm_S (w2[1], w2[2], selector); w5[3] = __byte_perm_S (w2[0], w2[1], selector); w5[2] = __byte_perm_S (w1[3], w2[0], selector); w5[1] = __byte_perm_S (w1[2], w1[3], selector); w5[0] = __byte_perm_S (w1[1], w1[2], selector); w4[3] = __byte_perm_S (w1[0], w1[1], selector); w4[2] = __byte_perm_S (w0[3], w1[0], selector); w4[1] = __byte_perm_S (w0[2], w0[3], selector); w4[0] = __byte_perm_S (w0[1], w0[2], selector); w3[3] = __byte_perm_S (w0[0], w0[1], selector); w3[2] = __byte_perm_S ( 0, w0[0], selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = __byte_perm_S (w3[3], w4[0], selector); w7[2] = __byte_perm_S (w3[2], w3[3], selector); w7[1] = __byte_perm_S (w3[1], w3[2], selector); w7[0] = __byte_perm_S (w3[0], w3[1], selector); w6[3] = __byte_perm_S (w2[3], w3[0], selector); w6[2] = __byte_perm_S (w2[2], w2[3], selector); w6[1] = __byte_perm_S (w2[1], w2[2], selector); w6[0] = __byte_perm_S (w2[0], w2[1], selector); w5[3] = __byte_perm_S (w1[3], w2[0], selector); w5[2] = __byte_perm_S (w1[2], w1[3], selector); w5[1] = __byte_perm_S (w1[1], w1[2], selector); w5[0] = __byte_perm_S (w1[0], w1[1], selector); w4[3] = __byte_perm_S (w0[3], w1[0], selector); w4[2] = __byte_perm_S (w0[2], w0[3], selector); w4[1] = __byte_perm_S (w0[1], w0[2], selector); w4[0] = __byte_perm_S (w0[0], w0[1], selector); w3[3] = __byte_perm_S ( 0, 
w0[0], selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_8x4_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], const u32 offset) { const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: w7[3] = amd_bytealign_S (w7[2], w7[3], offset); w7[2] = amd_bytealign_S (w7[1], w7[2], offset); w7[1] = amd_bytealign_S (w7[0], w7[1], offset); w7[0] = amd_bytealign_S (w6[3], w7[0], offset); w6[3] = amd_bytealign_S (w6[2], w6[3], offset); w6[2] = amd_bytealign_S (w6[1], w6[2], offset); w6[1] = amd_bytealign_S (w6[0], w6[1], offset); w6[0] = amd_bytealign_S (w5[3], w6[0], offset); w5[3] = amd_bytealign_S (w5[2], w5[3], offset); w5[2] = amd_bytealign_S (w5[1], w5[2], offset); w5[1] = amd_bytealign_S (w5[0], w5[1], offset); w5[0] = amd_bytealign_S (w4[3], w5[0], offset); w4[3] = amd_bytealign_S (w4[2], w4[3], offset); w4[2] = amd_bytealign_S (w4[1], w4[2], offset); w4[1] = amd_bytealign_S (w4[0], w4[1], offset); w4[0] = amd_bytealign_S (w3[3], w4[0], offset); w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); w3[1] = amd_bytealign_S (w3[0], w3[1], offset); w3[0] = amd_bytealign_S (w2[3], w3[0], offset); w2[3] = amd_bytealign_S (w2[2], w2[3], offset); w2[2] = amd_bytealign_S (w2[1], w2[2], offset); w2[1] = amd_bytealign_S (w2[0], w2[1], offset); w2[0] = amd_bytealign_S (w1[3], w2[0], offset); w1[3] = amd_bytealign_S (w1[2], w1[3], offset); w1[2] = amd_bytealign_S (w1[1], w1[2], offset); w1[1] = amd_bytealign_S (w1[0], w1[1], offset); w1[0] = amd_bytealign_S (w0[3], w1[0], offset); w0[3] = amd_bytealign_S (w0[2], w0[3], offset); w0[2] = amd_bytealign_S (w0[1], w0[2], offset); w0[1] = amd_bytealign_S (w0[0], w0[1], offset); w0[0] = amd_bytealign_S ( 
0, w0[0], offset); break; case 1: w7[3] = amd_bytealign_S (w7[1], w7[2], offset); w7[2] = amd_bytealign_S (w7[0], w7[1], offset); w7[1] = amd_bytealign_S (w6[3], w7[0], offset); w7[0] = amd_bytealign_S (w6[2], w6[3], offset); w6[3] = amd_bytealign_S (w6[1], w6[2], offset); w6[2] = amd_bytealign_S (w6[0], w6[1], offset); w6[1] = amd_bytealign_S (w5[3], w6[0], offset); w6[0] = amd_bytealign_S (w5[2], w5[3], offset); w5[3] = amd_bytealign_S (w5[1], w5[2], offset); w5[2] = amd_bytealign_S (w5[0], w5[1], offset); w5[1] = amd_bytealign_S (w4[3], w5[0], offset); w5[0] = amd_bytealign_S (w4[2], w4[3], offset); w4[3] = amd_bytealign_S (w4[1], w4[2], offset); w4[2] = amd_bytealign_S (w4[0], w4[1], offset); w4[1] = amd_bytealign_S (w3[3], w4[0], offset); w4[0] = amd_bytealign_S (w3[2], w3[3], offset); w3[3] = amd_bytealign_S (w3[1], w3[2], offset); w3[2] = amd_bytealign_S (w3[0], w3[1], offset); w3[1] = amd_bytealign_S (w2[3], w3[0], offset); w3[0] = amd_bytealign_S (w2[2], w2[3], offset); w2[3] = amd_bytealign_S (w2[1], w2[2], offset); w2[2] = amd_bytealign_S (w2[0], w2[1], offset); w2[1] = amd_bytealign_S (w1[3], w2[0], offset); w2[0] = amd_bytealign_S (w1[2], w1[3], offset); w1[3] = amd_bytealign_S (w1[1], w1[2], offset); w1[2] = amd_bytealign_S (w1[0], w1[1], offset); w1[1] = amd_bytealign_S (w0[3], w1[0], offset); w1[0] = amd_bytealign_S (w0[2], w0[3], offset); w0[3] = amd_bytealign_S (w0[1], w0[2], offset); w0[2] = amd_bytealign_S (w0[0], w0[1], offset); w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: w7[3] = amd_bytealign_S (w7[0], w7[1], offset); w7[2] = amd_bytealign_S (w6[3], w7[0], offset); w7[1] = amd_bytealign_S (w6[2], w6[3], offset); w7[0] = amd_bytealign_S (w6[1], w6[2], offset); w6[3] = amd_bytealign_S (w6[0], w6[1], offset); w6[2] = amd_bytealign_S (w5[3], w6[0], offset); w6[1] = amd_bytealign_S (w5[2], w5[3], offset); w6[0] = amd_bytealign_S (w5[1], w5[2], offset); w5[3] = amd_bytealign_S (w5[0], w5[1], offset); w5[2] = 
amd_bytealign_S (w4[3], w5[0], offset); w5[1] = amd_bytealign_S (w4[2], w4[3], offset); w5[0] = amd_bytealign_S (w4[1], w4[2], offset); w4[3] = amd_bytealign_S (w4[0], w4[1], offset); w4[2] = amd_bytealign_S (w3[3], w4[0], offset); w4[1] = amd_bytealign_S (w3[2], w3[3], offset); w4[0] = amd_bytealign_S (w3[1], w3[2], offset); w3[3] = amd_bytealign_S (w3[0], w3[1], offset); w3[2] = amd_bytealign_S (w2[3], w3[0], offset); w3[1] = amd_bytealign_S (w2[2], w2[3], offset); w3[0] = amd_bytealign_S (w2[1], w2[2], offset); w2[3] = amd_bytealign_S (w2[0], w2[1], offset); w2[2] = amd_bytealign_S (w1[3], w2[0], offset); w2[1] = amd_bytealign_S (w1[2], w1[3], offset); w2[0] = amd_bytealign_S (w1[1], w1[2], offset); w1[3] = amd_bytealign_S (w1[0], w1[1], offset); w1[2] = amd_bytealign_S (w0[3], w1[0], offset); w1[1] = amd_bytealign_S (w0[2], w0[3], offset); w1[0] = amd_bytealign_S (w0[1], w0[2], offset); w0[3] = amd_bytealign_S (w0[0], w0[1], offset); w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: w7[3] = amd_bytealign_S (w6[3], w7[0], offset); w7[2] = amd_bytealign_S (w6[2], w6[3], offset); w7[1] = amd_bytealign_S (w6[1], w6[2], offset); w7[0] = amd_bytealign_S (w6[0], w6[1], offset); w6[3] = amd_bytealign_S (w5[3], w6[0], offset); w6[2] = amd_bytealign_S (w5[2], w5[3], offset); w6[1] = amd_bytealign_S (w5[1], w5[2], offset); w6[0] = amd_bytealign_S (w5[0], w5[1], offset); w5[3] = amd_bytealign_S (w4[3], w5[0], offset); w5[2] = amd_bytealign_S (w4[2], w4[3], offset); w5[1] = amd_bytealign_S (w4[1], w4[2], offset); w5[0] = amd_bytealign_S (w4[0], w4[1], offset); w4[3] = amd_bytealign_S (w3[3], w4[0], offset); w4[2] = amd_bytealign_S (w3[2], w3[3], offset); w4[1] = amd_bytealign_S (w3[1], w3[2], offset); w4[0] = amd_bytealign_S (w3[0], w3[1], offset); w3[3] = amd_bytealign_S (w2[3], w3[0], offset); w3[2] = amd_bytealign_S (w2[2], w2[3], offset); w3[1] = amd_bytealign_S (w2[1], w2[2], offset); w3[0] = amd_bytealign_S (w2[0], w2[1], offset); w2[3] 
= amd_bytealign_S (w1[3], w2[0], offset); w2[2] = amd_bytealign_S (w1[2], w1[3], offset); w2[1] = amd_bytealign_S (w1[1], w1[2], offset); w2[0] = amd_bytealign_S (w1[0], w1[1], offset); w1[3] = amd_bytealign_S (w0[3], w1[0], offset); w1[2] = amd_bytealign_S (w0[2], w0[3], offset); w1[1] = amd_bytealign_S (w0[1], w0[2], offset); w1[0] = amd_bytealign_S (w0[0], w0[1], offset); w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w7[3] = amd_bytealign_S (w6[2], w6[3], offset); w7[2] = amd_bytealign_S (w6[1], w6[2], offset); w7[1] = amd_bytealign_S (w6[0], w6[1], offset); w7[0] = amd_bytealign_S (w5[3], w6[0], offset); w6[3] = amd_bytealign_S (w5[2], w5[3], offset); w6[2] = amd_bytealign_S (w5[1], w5[2], offset); w6[1] = amd_bytealign_S (w5[0], w5[1], offset); w6[0] = amd_bytealign_S (w4[3], w5[0], offset); w5[3] = amd_bytealign_S (w4[2], w4[3], offset); w5[2] = amd_bytealign_S (w4[1], w4[2], offset); w5[1] = amd_bytealign_S (w4[0], w4[1], offset); w5[0] = amd_bytealign_S (w3[3], w4[0], offset); w4[3] = amd_bytealign_S (w3[2], w3[3], offset); w4[2] = amd_bytealign_S (w3[1], w3[2], offset); w4[1] = amd_bytealign_S (w3[0], w3[1], offset); w4[0] = amd_bytealign_S (w2[3], w3[0], offset); w3[3] = amd_bytealign_S (w2[2], w2[3], offset); w3[2] = amd_bytealign_S (w2[1], w2[2], offset); w3[1] = amd_bytealign_S (w2[0], w2[1], offset); w3[0] = amd_bytealign_S (w1[3], w2[0], offset); w2[3] = amd_bytealign_S (w1[2], w1[3], offset); w2[2] = amd_bytealign_S (w1[1], w1[2], offset); w2[1] = amd_bytealign_S (w1[0], w1[1], offset); w2[0] = amd_bytealign_S (w0[3], w1[0], offset); w1[3] = amd_bytealign_S (w0[2], w0[3], offset); w1[2] = amd_bytealign_S (w0[1], w0[2], offset); w1[1] = amd_bytealign_S (w0[0], w0[1], offset); w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = amd_bytealign_S (w6[1], w6[2], offset); w7[2] = amd_bytealign_S (w6[0], w6[1], offset); w7[1] = amd_bytealign_S (w5[3], 
w6[0], offset); w7[0] = amd_bytealign_S (w5[2], w5[3], offset); w6[3] = amd_bytealign_S (w5[1], w5[2], offset); w6[2] = amd_bytealign_S (w5[0], w5[1], offset); w6[1] = amd_bytealign_S (w4[3], w5[0], offset); w6[0] = amd_bytealign_S (w4[2], w4[3], offset); w5[3] = amd_bytealign_S (w4[1], w4[2], offset); w5[2] = amd_bytealign_S (w4[0], w4[1], offset); w5[1] = amd_bytealign_S (w3[3], w4[0], offset); w5[0] = amd_bytealign_S (w3[2], w3[3], offset); w4[3] = amd_bytealign_S (w3[1], w3[2], offset); w4[2] = amd_bytealign_S (w3[0], w3[1], offset); w4[1] = amd_bytealign_S (w2[3], w3[0], offset); w4[0] = amd_bytealign_S (w2[2], w2[3], offset); w3[3] = amd_bytealign_S (w2[1], w2[2], offset); w3[2] = amd_bytealign_S (w2[0], w2[1], offset); w3[1] = amd_bytealign_S (w1[3], w2[0], offset); w3[0] = amd_bytealign_S (w1[2], w1[3], offset); w2[3] = amd_bytealign_S (w1[1], w1[2], offset); w2[2] = amd_bytealign_S (w1[0], w1[1], offset); w2[1] = amd_bytealign_S (w0[3], w1[0], offset); w2[0] = amd_bytealign_S (w0[2], w0[3], offset); w1[3] = amd_bytealign_S (w0[1], w0[2], offset); w1[2] = amd_bytealign_S (w0[0], w0[1], offset); w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = amd_bytealign_S (w6[0], w6[1], offset); w7[2] = amd_bytealign_S (w5[3], w6[0], offset); w7[1] = amd_bytealign_S (w5[2], w5[3], offset); w7[0] = amd_bytealign_S (w5[1], w5[2], offset); w6[3] = amd_bytealign_S (w5[0], w5[1], offset); w6[2] = amd_bytealign_S (w4[3], w5[0], offset); w6[1] = amd_bytealign_S (w4[2], w4[3], offset); w6[0] = amd_bytealign_S (w4[1], w4[2], offset); w5[3] = amd_bytealign_S (w4[0], w4[1], offset); w5[2] = amd_bytealign_S (w3[3], w4[0], offset); w5[1] = amd_bytealign_S (w3[2], w3[3], offset); w5[0] = amd_bytealign_S (w3[1], w3[2], offset); w4[3] = amd_bytealign_S (w3[0], w3[1], offset); w4[2] = amd_bytealign_S (w2[3], w3[0], offset); w4[1] = amd_bytealign_S (w2[2], w2[3], offset); w4[0] = amd_bytealign_S (w2[1], w2[2], 
offset); w3[3] = amd_bytealign_S (w2[0], w2[1], offset); w3[2] = amd_bytealign_S (w1[3], w2[0], offset); w3[1] = amd_bytealign_S (w1[2], w1[3], offset); w3[0] = amd_bytealign_S (w1[1], w1[2], offset); w2[3] = amd_bytealign_S (w1[0], w1[1], offset); w2[2] = amd_bytealign_S (w0[3], w1[0], offset); w2[1] = amd_bytealign_S (w0[2], w0[3], offset); w2[0] = amd_bytealign_S (w0[1], w0[2], offset); w1[3] = amd_bytealign_S (w0[0], w0[1], offset); w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = amd_bytealign_S (w5[3], w6[0], offset); w7[2] = amd_bytealign_S (w5[2], w5[3], offset); w7[1] = amd_bytealign_S (w5[1], w5[2], offset); w7[0] = amd_bytealign_S (w5[0], w5[1], offset); w6[3] = amd_bytealign_S (w4[3], w5[0], offset); w6[2] = amd_bytealign_S (w4[2], w4[3], offset); w6[1] = amd_bytealign_S (w4[1], w4[2], offset); w6[0] = amd_bytealign_S (w4[0], w4[1], offset); w5[3] = amd_bytealign_S (w3[3], w4[0], offset); w5[2] = amd_bytealign_S (w3[2], w3[3], offset); w5[1] = amd_bytealign_S (w3[1], w3[2], offset); w5[0] = amd_bytealign_S (w3[0], w3[1], offset); w4[3] = amd_bytealign_S (w2[3], w3[0], offset); w4[2] = amd_bytealign_S (w2[2], w2[3], offset); w4[1] = amd_bytealign_S (w2[1], w2[2], offset); w4[0] = amd_bytealign_S (w2[0], w2[1], offset); w3[3] = amd_bytealign_S (w1[3], w2[0], offset); w3[2] = amd_bytealign_S (w1[2], w1[3], offset); w3[1] = amd_bytealign_S (w1[1], w1[2], offset); w3[0] = amd_bytealign_S (w1[0], w1[1], offset); w2[3] = amd_bytealign_S (w0[3], w1[0], offset); w2[2] = amd_bytealign_S (w0[2], w0[3], offset); w2[1] = amd_bytealign_S (w0[1], w0[2], offset); w2[0] = amd_bytealign_S (w0[0], w0[1], offset); w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = amd_bytealign_S (w5[2], w5[3], offset); w7[2] = amd_bytealign_S (w5[1], w5[2], offset); w7[1] = amd_bytealign_S (w5[0], w5[1], offset); 
w7[0] = amd_bytealign_S (w4[3], w5[0], offset); w6[3] = amd_bytealign_S (w4[2], w4[3], offset); w6[2] = amd_bytealign_S (w4[1], w4[2], offset); w6[1] = amd_bytealign_S (w4[0], w4[1], offset); w6[0] = amd_bytealign_S (w3[3], w4[0], offset); w5[3] = amd_bytealign_S (w3[2], w3[3], offset); w5[2] = amd_bytealign_S (w3[1], w3[2], offset); w5[1] = amd_bytealign_S (w3[0], w3[1], offset); w5[0] = amd_bytealign_S (w2[3], w3[0], offset); w4[3] = amd_bytealign_S (w2[2], w2[3], offset); w4[2] = amd_bytealign_S (w2[1], w2[2], offset); w4[1] = amd_bytealign_S (w2[0], w2[1], offset); w4[0] = amd_bytealign_S (w1[3], w2[0], offset); w3[3] = amd_bytealign_S (w1[2], w1[3], offset); w3[2] = amd_bytealign_S (w1[1], w1[2], offset); w3[1] = amd_bytealign_S (w1[0], w1[1], offset); w3[0] = amd_bytealign_S (w0[3], w1[0], offset); w2[3] = amd_bytealign_S (w0[2], w0[3], offset); w2[2] = amd_bytealign_S (w0[1], w0[2], offset); w2[1] = amd_bytealign_S (w0[0], w0[1], offset); w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = amd_bytealign_S (w5[1], w5[2], offset); w7[2] = amd_bytealign_S (w5[0], w5[1], offset); w7[1] = amd_bytealign_S (w4[3], w5[0], offset); w7[0] = amd_bytealign_S (w4[2], w4[3], offset); w6[3] = amd_bytealign_S (w4[1], w4[2], offset); w6[2] = amd_bytealign_S (w4[0], w4[1], offset); w6[1] = amd_bytealign_S (w3[3], w4[0], offset); w6[0] = amd_bytealign_S (w3[2], w3[3], offset); w5[3] = amd_bytealign_S (w3[1], w3[2], offset); w5[2] = amd_bytealign_S (w3[0], w3[1], offset); w5[1] = amd_bytealign_S (w2[3], w3[0], offset); w5[0] = amd_bytealign_S (w2[2], w2[3], offset); w4[3] = amd_bytealign_S (w2[1], w2[2], offset); w4[2] = amd_bytealign_S (w2[0], w2[1], offset); w4[1] = amd_bytealign_S (w1[3], w2[0], offset); w4[0] = amd_bytealign_S (w1[2], w1[3], offset); w3[3] = amd_bytealign_S (w1[1], w1[2], offset); w3[2] = amd_bytealign_S (w1[0], w1[1], offset); w3[1] = amd_bytealign_S 
(w0[3], w1[0], offset); w3[0] = amd_bytealign_S (w0[2], w0[3], offset); w2[3] = amd_bytealign_S (w0[1], w0[2], offset); w2[2] = amd_bytealign_S (w0[0], w0[1], offset); w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = amd_bytealign_S (w5[0], w5[1], offset); w7[2] = amd_bytealign_S (w4[3], w5[0], offset); w7[1] = amd_bytealign_S (w4[2], w4[3], offset); w7[0] = amd_bytealign_S (w4[1], w4[2], offset); w6[3] = amd_bytealign_S (w4[0], w4[1], offset); w6[2] = amd_bytealign_S (w3[3], w4[0], offset); w6[1] = amd_bytealign_S (w3[2], w3[3], offset); w6[0] = amd_bytealign_S (w3[1], w3[2], offset); w5[3] = amd_bytealign_S (w3[0], w3[1], offset); w5[2] = amd_bytealign_S (w2[3], w3[0], offset); w5[1] = amd_bytealign_S (w2[2], w2[3], offset); w5[0] = amd_bytealign_S (w2[1], w2[2], offset); w4[3] = amd_bytealign_S (w2[0], w2[1], offset); w4[2] = amd_bytealign_S (w1[3], w2[0], offset); w4[1] = amd_bytealign_S (w1[2], w1[3], offset); w4[0] = amd_bytealign_S (w1[1], w1[2], offset); w3[3] = amd_bytealign_S (w1[0], w1[1], offset); w3[2] = amd_bytealign_S (w0[3], w1[0], offset); w3[1] = amd_bytealign_S (w0[2], w0[3], offset); w3[0] = amd_bytealign_S (w0[1], w0[2], offset); w2[3] = amd_bytealign_S (w0[0], w0[1], offset); w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = amd_bytealign_S (w4[3], w5[0], offset); w7[2] = amd_bytealign_S (w4[2], w4[3], offset); w7[1] = amd_bytealign_S (w4[1], w4[2], offset); w7[0] = amd_bytealign_S (w4[0], w4[1], offset); w6[3] = amd_bytealign_S (w3[3], w4[0], offset); w6[2] = amd_bytealign_S (w3[2], w3[3], offset); w6[1] = amd_bytealign_S (w3[1], w3[2], offset); w6[0] = amd_bytealign_S (w3[0], w3[1], offset); w5[3] = amd_bytealign_S (w2[3], w3[0], offset); w5[2] = amd_bytealign_S (w2[2], w2[3], offset); w5[1] = 
amd_bytealign_S (w2[1], w2[2], offset); w5[0] = amd_bytealign_S (w2[0], w2[1], offset); w4[3] = amd_bytealign_S (w1[3], w2[0], offset); w4[2] = amd_bytealign_S (w1[2], w1[3], offset); w4[1] = amd_bytealign_S (w1[1], w1[2], offset); w4[0] = amd_bytealign_S (w1[0], w1[1], offset); w3[3] = amd_bytealign_S (w0[3], w1[0], offset); w3[2] = amd_bytealign_S (w0[2], w0[3], offset); w3[1] = amd_bytealign_S (w0[1], w0[2], offset); w3[0] = amd_bytealign_S (w0[0], w0[1], offset); w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = amd_bytealign_S (w4[2], w4[3], offset); w7[2] = amd_bytealign_S (w4[1], w4[2], offset); w7[1] = amd_bytealign_S (w4[0], w4[1], offset); w7[0] = amd_bytealign_S (w3[3], w4[0], offset); w6[3] = amd_bytealign_S (w3[2], w3[3], offset); w6[2] = amd_bytealign_S (w3[1], w3[2], offset); w6[1] = amd_bytealign_S (w3[0], w3[1], offset); w6[0] = amd_bytealign_S (w2[3], w3[0], offset); w5[3] = amd_bytealign_S (w2[2], w2[3], offset); w5[2] = amd_bytealign_S (w2[1], w2[2], offset); w5[1] = amd_bytealign_S (w2[0], w2[1], offset); w5[0] = amd_bytealign_S (w1[3], w2[0], offset); w4[3] = amd_bytealign_S (w1[2], w1[3], offset); w4[2] = amd_bytealign_S (w1[1], w1[2], offset); w4[1] = amd_bytealign_S (w1[0], w1[1], offset); w4[0] = amd_bytealign_S (w0[3], w1[0], offset); w3[3] = amd_bytealign_S (w0[2], w0[3], offset); w3[2] = amd_bytealign_S (w0[1], w0[2], offset); w3[1] = amd_bytealign_S (w0[0], w0[1], offset); w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = amd_bytealign_S (w4[1], w4[2], offset); w7[2] = amd_bytealign_S (w4[0], w4[1], offset); w7[1] = amd_bytealign_S (w3[3], w4[0], offset); w7[0] = amd_bytealign_S (w3[2], w3[3], offset); w6[3] = amd_bytealign_S (w3[1], w3[2], offset); 
w6[2] = amd_bytealign_S (w3[0], w3[1], offset); w6[1] = amd_bytealign_S (w2[3], w3[0], offset); w6[0] = amd_bytealign_S (w2[2], w2[3], offset); w5[3] = amd_bytealign_S (w2[1], w2[2], offset); w5[2] = amd_bytealign_S (w2[0], w2[1], offset); w5[1] = amd_bytealign_S (w1[3], w2[0], offset); w5[0] = amd_bytealign_S (w1[2], w1[3], offset); w4[3] = amd_bytealign_S (w1[1], w1[2], offset); w4[2] = amd_bytealign_S (w1[0], w1[1], offset); w4[1] = amd_bytealign_S (w0[3], w1[0], offset); w4[0] = amd_bytealign_S (w0[2], w0[3], offset); w3[3] = amd_bytealign_S (w0[1], w0[2], offset); w3[2] = amd_bytealign_S (w0[0], w0[1], offset); w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = amd_bytealign_S (w4[0], w4[1], offset); w7[2] = amd_bytealign_S (w3[3], w4[0], offset); w7[1] = amd_bytealign_S (w3[2], w3[3], offset); w7[0] = amd_bytealign_S (w3[1], w3[2], offset); w6[3] = amd_bytealign_S (w3[0], w3[1], offset); w6[2] = amd_bytealign_S (w2[3], w3[0], offset); w6[1] = amd_bytealign_S (w2[2], w2[3], offset); w6[0] = amd_bytealign_S (w2[1], w2[2], offset); w5[3] = amd_bytealign_S (w2[0], w2[1], offset); w5[2] = amd_bytealign_S (w1[3], w2[0], offset); w5[1] = amd_bytealign_S (w1[2], w1[3], offset); w5[0] = amd_bytealign_S (w1[1], w1[2], offset); w4[3] = amd_bytealign_S (w1[0], w1[1], offset); w4[2] = amd_bytealign_S (w0[3], w1[0], offset); w4[1] = amd_bytealign_S (w0[2], w0[3], offset); w4[0] = amd_bytealign_S (w0[1], w0[2], offset); w3[3] = amd_bytealign_S (w0[0], w0[1], offset); w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = amd_bytealign_S (w3[3], w4[0], offset); w7[2] = amd_bytealign_S (w3[2], w3[3], offset); w7[1] = amd_bytealign_S (w3[1], w3[2], 
offset); w7[0] = amd_bytealign_S (w3[0], w3[1], offset); w6[3] = amd_bytealign_S (w2[3], w3[0], offset); w6[2] = amd_bytealign_S (w2[2], w2[3], offset); w6[1] = amd_bytealign_S (w2[1], w2[2], offset); w6[0] = amd_bytealign_S (w2[0], w2[1], offset); w5[3] = amd_bytealign_S (w1[3], w2[0], offset); w5[2] = amd_bytealign_S (w1[2], w1[3], offset); w5[1] = amd_bytealign_S (w1[1], w1[2], offset); w5[0] = amd_bytealign_S (w1[0], w1[1], offset); w4[3] = amd_bytealign_S (w0[3], w1[0], offset); w4[2] = amd_bytealign_S (w0[2], w0[3], offset); w4[1] = amd_bytealign_S (w0[1], w0[2], offset); w4[0] = amd_bytealign_S (w0[0], w0[1], offset); w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: w7[3] = amd_bytealign_S (w3[2], w3[3], offset); w7[2] = amd_bytealign_S (w3[1], w3[2], offset); w7[1] = amd_bytealign_S (w3[0], w3[1], offset); w7[0] = amd_bytealign_S (w2[3], w3[0], offset); w6[3] = amd_bytealign_S (w2[2], w2[3], offset); w6[2] = amd_bytealign_S (w2[1], w2[2], offset); w6[1] = amd_bytealign_S (w2[0], w2[1], offset); w6[0] = amd_bytealign_S (w1[3], w2[0], offset); w5[3] = amd_bytealign_S (w1[2], w1[3], offset); w5[2] = amd_bytealign_S (w1[1], w1[2], offset); w5[1] = amd_bytealign_S (w1[0], w1[1], offset); w5[0] = amd_bytealign_S (w0[3], w1[0], offset); w4[3] = amd_bytealign_S (w0[2], w0[3], offset); w4[2] = amd_bytealign_S (w0[1], w0[2], offset); w4[1] = amd_bytealign_S (w0[0], w0[1], offset); w4[0] = amd_bytealign_S ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: w7[3] = amd_bytealign_S (w3[1], w3[2], offset); w7[2] = amd_bytealign_S (w3[0], w3[1], offset); w7[1] = amd_bytealign_S (w2[3], w3[0], offset); w7[0] = amd_bytealign_S (w2[2], w2[3], 
offset); w6[3] = amd_bytealign_S (w2[1], w2[2], offset); w6[2] = amd_bytealign_S (w2[0], w2[1], offset); w6[1] = amd_bytealign_S (w1[3], w2[0], offset); w6[0] = amd_bytealign_S (w1[2], w1[3], offset); w5[3] = amd_bytealign_S (w1[1], w1[2], offset); w5[2] = amd_bytealign_S (w1[0], w1[1], offset); w5[1] = amd_bytealign_S (w0[3], w1[0], offset); w5[0] = amd_bytealign_S (w0[2], w0[3], offset); w4[3] = amd_bytealign_S (w0[1], w0[2], offset); w4[2] = amd_bytealign_S (w0[0], w0[1], offset); w4[1] = amd_bytealign_S ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: w7[3] = amd_bytealign_S (w3[0], w3[1], offset); w7[2] = amd_bytealign_S (w2[3], w3[0], offset); w7[1] = amd_bytealign_S (w2[2], w2[3], offset); w7[0] = amd_bytealign_S (w2[1], w2[2], offset); w6[3] = amd_bytealign_S (w2[0], w2[1], offset); w6[2] = amd_bytealign_S (w1[3], w2[0], offset); w6[1] = amd_bytealign_S (w1[2], w1[3], offset); w6[0] = amd_bytealign_S (w1[1], w1[2], offset); w5[3] = amd_bytealign_S (w1[0], w1[1], offset); w5[2] = amd_bytealign_S (w0[3], w1[0], offset); w5[1] = amd_bytealign_S (w0[2], w0[3], offset); w5[0] = amd_bytealign_S (w0[1], w0[2], offset); w4[3] = amd_bytealign_S (w0[0], w0[1], offset); w4[2] = amd_bytealign_S ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: w7[3] = amd_bytealign_S (w2[3], w3[0], offset); w7[2] = amd_bytealign_S (w2[2], w2[3], offset); w7[1] = amd_bytealign_S (w2[1], w2[2], offset); w7[0] = amd_bytealign_S (w2[0], w2[1], offset); w6[3] = amd_bytealign_S (w1[3], w2[0], offset); w6[2] = amd_bytealign_S (w1[2], w1[3], offset); w6[1] = amd_bytealign_S (w1[1], w1[2], offset); w6[0] = amd_bytealign_S (w1[0], w1[1], 
offset); w5[3] = amd_bytealign_S (w0[3], w1[0], offset); w5[2] = amd_bytealign_S (w0[2], w0[3], offset); w5[1] = amd_bytealign_S (w0[1], w0[2], offset); w5[0] = amd_bytealign_S (w0[0], w0[1], offset); w4[3] = amd_bytealign_S ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: w7[3] = amd_bytealign_S (w2[2], w2[3], offset); w7[2] = amd_bytealign_S (w2[1], w2[2], offset); w7[1] = amd_bytealign_S (w2[0], w2[1], offset); w7[0] = amd_bytealign_S (w1[3], w2[0], offset); w6[3] = amd_bytealign_S (w1[2], w1[3], offset); w6[2] = amd_bytealign_S (w1[1], w1[2], offset); w6[1] = amd_bytealign_S (w1[0], w1[1], offset); w6[0] = amd_bytealign_S (w0[3], w1[0], offset); w5[3] = amd_bytealign_S (w0[2], w0[3], offset); w5[2] = amd_bytealign_S (w0[1], w0[2], offset); w5[1] = amd_bytealign_S (w0[0], w0[1], offset); w5[0] = amd_bytealign_S ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: w7[3] = amd_bytealign_S (w2[1], w2[2], offset); w7[2] = amd_bytealign_S (w2[0], w2[1], offset); w7[1] = amd_bytealign_S (w1[3], w2[0], offset); w7[0] = amd_bytealign_S (w1[2], w1[3], offset); w6[3] = amd_bytealign_S (w1[1], w1[2], offset); w6[2] = amd_bytealign_S (w1[0], w1[1], offset); w6[1] = amd_bytealign_S (w0[3], w1[0], offset); w6[0] = amd_bytealign_S (w0[2], w0[3], offset); w5[3] = amd_bytealign_S (w0[1], w0[2], offset); w5[2] = amd_bytealign_S (w0[0], w0[1], offset); w5[1] = amd_bytealign_S ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 
0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: w7[3] = amd_bytealign_S (w2[0], w2[1], offset); w7[2] = amd_bytealign_S (w1[3], w2[0], offset); w7[1] = amd_bytealign_S (w1[2], w1[3], offset); w7[0] = amd_bytealign_S (w1[1], w1[2], offset); w6[3] = amd_bytealign_S (w1[0], w1[1], offset); w6[2] = amd_bytealign_S (w0[3], w1[0], offset); w6[1] = amd_bytealign_S (w0[2], w0[3], offset); w6[0] = amd_bytealign_S (w0[1], w0[2], offset); w5[3] = amd_bytealign_S (w0[0], w0[1], offset); w5[2] = amd_bytealign_S ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: w7[3] = amd_bytealign_S (w1[3], w2[0], offset); w7[2] = amd_bytealign_S (w1[2], w1[3], offset); w7[1] = amd_bytealign_S (w1[1], w1[2], offset); w7[0] = amd_bytealign_S (w1[0], w1[1], offset); w6[3] = amd_bytealign_S (w0[3], w1[0], offset); w6[2] = amd_bytealign_S (w0[2], w0[3], offset); w6[1] = amd_bytealign_S (w0[1], w0[2], offset); w6[0] = amd_bytealign_S (w0[0], w0[1], offset); w5[3] = amd_bytealign_S ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: w7[3] = amd_bytealign_S (w1[2], w1[3], offset); w7[2] = amd_bytealign_S (w1[1], w1[2], offset); w7[1] = amd_bytealign_S (w1[0], w1[1], offset); w7[0] = amd_bytealign_S (w0[3], w1[0], offset); w6[3] = amd_bytealign_S (w0[2], w0[3], offset); w6[2] = amd_bytealign_S (w0[1], w0[2], offset); w6[1] = amd_bytealign_S (w0[0], w0[1], offset); w6[0] = amd_bytealign_S ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; 
w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: w7[3] = amd_bytealign_S (w1[1], w1[2], offset); w7[2] = amd_bytealign_S (w1[0], w1[1], offset); w7[1] = amd_bytealign_S (w0[3], w1[0], offset); w7[0] = amd_bytealign_S (w0[2], w0[3], offset); w6[3] = amd_bytealign_S (w0[1], w0[2], offset); w6[2] = amd_bytealign_S (w0[0], w0[1], offset); w6[1] = amd_bytealign_S ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: w7[3] = amd_bytealign_S (w1[0], w1[1], offset); w7[2] = amd_bytealign_S (w0[3], w1[0], offset); w7[1] = amd_bytealign_S (w0[2], w0[3], offset); w7[0] = amd_bytealign_S (w0[1], w0[2], offset); w6[3] = amd_bytealign_S (w0[0], w0[1], offset); w6[2] = amd_bytealign_S ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: w7[3] = amd_bytealign_S (w0[3], w1[0], offset); w7[2] = amd_bytealign_S (w0[2], w0[3], offset); w7[1] = amd_bytealign_S (w0[1], w0[2], offset); w7[0] = amd_bytealign_S (w0[0], w0[1], offset); w6[3] = amd_bytealign_S ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: w7[3] = amd_bytealign_S (w0[2], w0[3], offset); w7[2] = amd_bytealign_S (w0[1], w0[2], 
offset); w7[1] = amd_bytealign_S (w0[0], w0[1], offset); w7[0] = amd_bytealign_S ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: w7[3] = amd_bytealign_S (w0[1], w0[2], offset); w7[2] = amd_bytealign_S (w0[0], w0[1], offset); w7[1] = amd_bytealign_S ( 0, w0[0], offset); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: w7[3] = amd_bytealign_S (w0[0], w0[1], offset); w7[2] = amd_bytealign_S ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: w7[3] = amd_bytealign_S ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) 
{ case 0: w7[3] = __byte_perm_S (w7[3], w7[2], selector); w7[2] = __byte_perm_S (w7[2], w7[1], selector); w7[1] = __byte_perm_S (w7[1], w7[0], selector); w7[0] = __byte_perm_S (w7[0], w6[3], selector); w6[3] = __byte_perm_S (w6[3], w6[2], selector); w6[2] = __byte_perm_S (w6[2], w6[1], selector); w6[1] = __byte_perm_S (w6[1], w6[0], selector); w6[0] = __byte_perm_S (w6[0], w5[3], selector); w5[3] = __byte_perm_S (w5[3], w5[2], selector); w5[2] = __byte_perm_S (w5[2], w5[1], selector); w5[1] = __byte_perm_S (w5[1], w5[0], selector); w5[0] = __byte_perm_S (w5[0], w4[3], selector); w4[3] = __byte_perm_S (w4[3], w4[2], selector); w4[2] = __byte_perm_S (w4[2], w4[1], selector); w4[1] = __byte_perm_S (w4[1], w4[0], selector); w4[0] = __byte_perm_S (w4[0], w3[3], selector); w3[3] = __byte_perm_S (w3[3], w3[2], selector); w3[2] = __byte_perm_S (w3[2], w3[1], selector); w3[1] = __byte_perm_S (w3[1], w3[0], selector); w3[0] = __byte_perm_S (w3[0], w2[3], selector); w2[3] = __byte_perm_S (w2[3], w2[2], selector); w2[2] = __byte_perm_S (w2[2], w2[1], selector); w2[1] = __byte_perm_S (w2[1], w2[0], selector); w2[0] = __byte_perm_S (w2[0], w1[3], selector); w1[3] = __byte_perm_S (w1[3], w1[2], selector); w1[2] = __byte_perm_S (w1[2], w1[1], selector); w1[1] = __byte_perm_S (w1[1], w1[0], selector); w1[0] = __byte_perm_S (w1[0], w0[3], selector); w0[3] = __byte_perm_S (w0[3], w0[2], selector); w0[2] = __byte_perm_S (w0[2], w0[1], selector); w0[1] = __byte_perm_S (w0[1], w0[0], selector); w0[0] = __byte_perm_S (w0[0], 0, selector); break; case 1: w7[3] = __byte_perm_S (w7[2], w7[1], selector); w7[2] = __byte_perm_S (w7[1], w7[0], selector); w7[1] = __byte_perm_S (w7[0], w6[3], selector); w7[0] = __byte_perm_S (w6[3], w6[2], selector); w6[3] = __byte_perm_S (w6[2], w6[1], selector); w6[2] = __byte_perm_S (w6[1], w6[0], selector); w6[1] = __byte_perm_S (w6[0], w5[3], selector); w6[0] = __byte_perm_S (w5[3], w5[2], selector); w5[3] = __byte_perm_S (w5[2], w5[1], selector); w5[2] = 
__byte_perm_S (w5[1], w5[0], selector); w5[1] = __byte_perm_S (w5[0], w4[3], selector); w5[0] = __byte_perm_S (w4[3], w4[2], selector); w4[3] = __byte_perm_S (w4[2], w4[1], selector); w4[2] = __byte_perm_S (w4[1], w4[0], selector); w4[1] = __byte_perm_S (w4[0], w3[3], selector); w4[0] = __byte_perm_S (w3[3], w3[2], selector); w3[3] = __byte_perm_S (w3[2], w3[1], selector); w3[2] = __byte_perm_S (w3[1], w3[0], selector); w3[1] = __byte_perm_S (w3[0], w2[3], selector); w3[0] = __byte_perm_S (w2[3], w2[2], selector); w2[3] = __byte_perm_S (w2[2], w2[1], selector); w2[2] = __byte_perm_S (w2[1], w2[0], selector); w2[1] = __byte_perm_S (w2[0], w1[3], selector); w2[0] = __byte_perm_S (w1[3], w1[2], selector); w1[3] = __byte_perm_S (w1[2], w1[1], selector); w1[2] = __byte_perm_S (w1[1], w1[0], selector); w1[1] = __byte_perm_S (w1[0], w0[3], selector); w1[0] = __byte_perm_S (w0[3], w0[2], selector); w0[3] = __byte_perm_S (w0[2], w0[1], selector); w0[2] = __byte_perm_S (w0[1], w0[0], selector); w0[1] = __byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: w7[3] = __byte_perm_S (w7[1], w7[0], selector); w7[2] = __byte_perm_S (w7[0], w6[3], selector); w7[1] = __byte_perm_S (w6[3], w6[2], selector); w7[0] = __byte_perm_S (w6[2], w6[1], selector); w6[3] = __byte_perm_S (w6[1], w6[0], selector); w6[2] = __byte_perm_S (w6[0], w5[3], selector); w6[1] = __byte_perm_S (w5[3], w5[2], selector); w6[0] = __byte_perm_S (w5[2], w5[1], selector); w5[3] = __byte_perm_S (w5[1], w5[0], selector); w5[2] = __byte_perm_S (w5[0], w4[3], selector); w5[1] = __byte_perm_S (w4[3], w4[2], selector); w5[0] = __byte_perm_S (w4[2], w4[1], selector); w4[3] = __byte_perm_S (w4[1], w4[0], selector); w4[2] = __byte_perm_S (w4[0], w3[3], selector); w4[1] = __byte_perm_S (w3[3], w3[2], selector); w4[0] = __byte_perm_S (w3[2], w3[1], selector); w3[3] = __byte_perm_S (w3[1], w3[0], selector); w3[2] = __byte_perm_S (w3[0], w2[3], selector); w3[1] = __byte_perm_S (w2[3], w2[2], selector); w3[0] = 
__byte_perm_S (w2[2], w2[1], selector); w2[3] = __byte_perm_S (w2[1], w2[0], selector); w2[2] = __byte_perm_S (w2[0], w1[3], selector); w2[1] = __byte_perm_S (w1[3], w1[2], selector); w2[0] = __byte_perm_S (w1[2], w1[1], selector); w1[3] = __byte_perm_S (w1[1], w1[0], selector); w1[2] = __byte_perm_S (w1[0], w0[3], selector); w1[1] = __byte_perm_S (w0[3], w0[2], selector); w1[0] = __byte_perm_S (w0[2], w0[1], selector); w0[3] = __byte_perm_S (w0[1], w0[0], selector); w0[2] = __byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: w7[3] = __byte_perm_S (w7[0], w6[3], selector); w7[2] = __byte_perm_S (w6[3], w6[2], selector); w7[1] = __byte_perm_S (w6[2], w6[1], selector); w7[0] = __byte_perm_S (w6[1], w6[0], selector); w6[3] = __byte_perm_S (w6[0], w5[3], selector); w6[2] = __byte_perm_S (w5[3], w5[2], selector); w6[1] = __byte_perm_S (w5[2], w5[1], selector); w6[0] = __byte_perm_S (w5[1], w5[0], selector); w5[3] = __byte_perm_S (w5[0], w4[3], selector); w5[2] = __byte_perm_S (w4[3], w4[2], selector); w5[1] = __byte_perm_S (w4[2], w4[1], selector); w5[0] = __byte_perm_S (w4[1], w4[0], selector); w4[3] = __byte_perm_S (w4[0], w3[3], selector); w4[2] = __byte_perm_S (w3[3], w3[2], selector); w4[1] = __byte_perm_S (w3[2], w3[1], selector); w4[0] = __byte_perm_S (w3[1], w3[0], selector); w3[3] = __byte_perm_S (w3[0], w2[3], selector); w3[2] = __byte_perm_S (w2[3], w2[2], selector); w3[1] = __byte_perm_S (w2[2], w2[1], selector); w3[0] = __byte_perm_S (w2[1], w2[0], selector); w2[3] = __byte_perm_S (w2[0], w1[3], selector); w2[2] = __byte_perm_S (w1[3], w1[2], selector); w2[1] = __byte_perm_S (w1[2], w1[1], selector); w2[0] = __byte_perm_S (w1[1], w1[0], selector); w1[3] = __byte_perm_S (w1[0], w0[3], selector); w1[2] = __byte_perm_S (w0[3], w0[2], selector); w1[1] = __byte_perm_S (w0[2], w0[1], selector); w1[0] = __byte_perm_S (w0[1], w0[0], selector); w0[3] = __byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: w7[3] = 
__byte_perm_S (w6[3], w6[2], selector); w7[2] = __byte_perm_S (w6[2], w6[1], selector); w7[1] = __byte_perm_S (w6[1], w6[0], selector); w7[0] = __byte_perm_S (w6[0], w5[3], selector); w6[3] = __byte_perm_S (w5[3], w5[2], selector); w6[2] = __byte_perm_S (w5[2], w5[1], selector); w6[1] = __byte_perm_S (w5[1], w5[0], selector); w6[0] = __byte_perm_S (w5[0], w4[3], selector); w5[3] = __byte_perm_S (w4[3], w4[2], selector); w5[2] = __byte_perm_S (w4[2], w4[1], selector); w5[1] = __byte_perm_S (w4[1], w4[0], selector); w5[0] = __byte_perm_S (w4[0], w3[3], selector); w4[3] = __byte_perm_S (w3[3], w3[2], selector); w4[2] = __byte_perm_S (w3[2], w3[1], selector); w4[1] = __byte_perm_S (w3[1], w3[0], selector); w4[0] = __byte_perm_S (w3[0], w2[3], selector); w3[3] = __byte_perm_S (w2[3], w2[2], selector); w3[2] = __byte_perm_S (w2[2], w2[1], selector); w3[1] = __byte_perm_S (w2[1], w2[0], selector); w3[0] = __byte_perm_S (w2[0], w1[3], selector); w2[3] = __byte_perm_S (w1[3], w1[2], selector); w2[2] = __byte_perm_S (w1[2], w1[1], selector); w2[1] = __byte_perm_S (w1[1], w1[0], selector); w2[0] = __byte_perm_S (w1[0], w0[3], selector); w1[3] = __byte_perm_S (w0[3], w0[2], selector); w1[2] = __byte_perm_S (w0[2], w0[1], selector); w1[1] = __byte_perm_S (w0[1], w0[0], selector); w1[0] = __byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: w7[3] = __byte_perm_S (w6[2], w6[1], selector); w7[2] = __byte_perm_S (w6[1], w6[0], selector); w7[1] = __byte_perm_S (w6[0], w5[3], selector); w7[0] = __byte_perm_S (w5[3], w5[2], selector); w6[3] = __byte_perm_S (w5[2], w5[1], selector); w6[2] = __byte_perm_S (w5[1], w5[0], selector); w6[1] = __byte_perm_S (w5[0], w4[3], selector); w6[0] = __byte_perm_S (w4[3], w4[2], selector); w5[3] = __byte_perm_S (w4[2], w4[1], selector); w5[2] = __byte_perm_S (w4[1], w4[0], selector); w5[1] = __byte_perm_S (w4[0], w3[3], selector); w5[0] = __byte_perm_S (w3[3], w3[2], selector); w4[3] = __byte_perm_S (w3[2], 
w3[1], selector); w4[2] = __byte_perm_S (w3[1], w3[0], selector); w4[1] = __byte_perm_S (w3[0], w2[3], selector); w4[0] = __byte_perm_S (w2[3], w2[2], selector); w3[3] = __byte_perm_S (w2[2], w2[1], selector); w3[2] = __byte_perm_S (w2[1], w2[0], selector); w3[1] = __byte_perm_S (w2[0], w1[3], selector); w3[0] = __byte_perm_S (w1[3], w1[2], selector); w2[3] = __byte_perm_S (w1[2], w1[1], selector); w2[2] = __byte_perm_S (w1[1], w1[0], selector); w2[1] = __byte_perm_S (w1[0], w0[3], selector); w2[0] = __byte_perm_S (w0[3], w0[2], selector); w1[3] = __byte_perm_S (w0[2], w0[1], selector); w1[2] = __byte_perm_S (w0[1], w0[0], selector); w1[1] = __byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: w7[3] = __byte_perm_S (w6[1], w6[0], selector); w7[2] = __byte_perm_S (w6[0], w5[3], selector); w7[1] = __byte_perm_S (w5[3], w5[2], selector); w7[0] = __byte_perm_S (w5[2], w5[1], selector); w6[3] = __byte_perm_S (w5[1], w5[0], selector); w6[2] = __byte_perm_S (w5[0], w4[3], selector); w6[1] = __byte_perm_S (w4[3], w4[2], selector); w6[0] = __byte_perm_S (w4[2], w4[1], selector); w5[3] = __byte_perm_S (w4[1], w4[0], selector); w5[2] = __byte_perm_S (w4[0], w3[3], selector); w5[1] = __byte_perm_S (w3[3], w3[2], selector); w5[0] = __byte_perm_S (w3[2], w3[1], selector); w4[3] = __byte_perm_S (w3[1], w3[0], selector); w4[2] = __byte_perm_S (w3[0], w2[3], selector); w4[1] = __byte_perm_S (w2[3], w2[2], selector); w4[0] = __byte_perm_S (w2[2], w2[1], selector); w3[3] = __byte_perm_S (w2[1], w2[0], selector); w3[2] = __byte_perm_S (w2[0], w1[3], selector); w3[1] = __byte_perm_S (w1[3], w1[2], selector); w3[0] = __byte_perm_S (w1[2], w1[1], selector); w2[3] = __byte_perm_S (w1[1], w1[0], selector); w2[2] = __byte_perm_S (w1[0], w0[3], selector); w2[1] = __byte_perm_S (w0[3], w0[2], selector); w2[0] = __byte_perm_S (w0[2], w0[1], selector); w1[3] = __byte_perm_S (w0[1], w0[0], selector); w1[2] = __byte_perm_S (w0[0], 0, selector); 
w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: w7[3] = __byte_perm_S (w6[0], w5[3], selector); w7[2] = __byte_perm_S (w5[3], w5[2], selector); w7[1] = __byte_perm_S (w5[2], w5[1], selector); w7[0] = __byte_perm_S (w5[1], w5[0], selector); w6[3] = __byte_perm_S (w5[0], w4[3], selector); w6[2] = __byte_perm_S (w4[3], w4[2], selector); w6[1] = __byte_perm_S (w4[2], w4[1], selector); w6[0] = __byte_perm_S (w4[1], w4[0], selector); w5[3] = __byte_perm_S (w4[0], w3[3], selector); w5[2] = __byte_perm_S (w3[3], w3[2], selector); w5[1] = __byte_perm_S (w3[2], w3[1], selector); w5[0] = __byte_perm_S (w3[1], w3[0], selector); w4[3] = __byte_perm_S (w3[0], w2[3], selector); w4[2] = __byte_perm_S (w2[3], w2[2], selector); w4[1] = __byte_perm_S (w2[2], w2[1], selector); w4[0] = __byte_perm_S (w2[1], w2[0], selector); w3[3] = __byte_perm_S (w2[0], w1[3], selector); w3[2] = __byte_perm_S (w1[3], w1[2], selector); w3[1] = __byte_perm_S (w1[2], w1[1], selector); w3[0] = __byte_perm_S (w1[1], w1[0], selector); w2[3] = __byte_perm_S (w1[0], w0[3], selector); w2[2] = __byte_perm_S (w0[3], w0[2], selector); w2[1] = __byte_perm_S (w0[2], w0[1], selector); w2[0] = __byte_perm_S (w0[1], w0[0], selector); w1[3] = __byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: w7[3] = __byte_perm_S (w5[3], w5[2], selector); w7[2] = __byte_perm_S (w5[2], w5[1], selector); w7[1] = __byte_perm_S (w5[1], w5[0], selector); w7[0] = __byte_perm_S (w5[0], w4[3], selector); w6[3] = __byte_perm_S (w4[3], w4[2], selector); w6[2] = __byte_perm_S (w4[2], w4[1], selector); w6[1] = __byte_perm_S (w4[1], w4[0], selector); w6[0] = __byte_perm_S (w4[0], w3[3], selector); w5[3] = __byte_perm_S (w3[3], w3[2], selector); w5[2] = __byte_perm_S (w3[2], w3[1], selector); w5[1] = __byte_perm_S (w3[1], w3[0], selector); w5[0] = __byte_perm_S (w3[0], w2[3], selector); w4[3] = __byte_perm_S (w2[3], w2[2], selector); w4[2] 
= __byte_perm_S (w2[2], w2[1], selector); w4[1] = __byte_perm_S (w2[1], w2[0], selector); w4[0] = __byte_perm_S (w2[0], w1[3], selector); w3[3] = __byte_perm_S (w1[3], w1[2], selector); w3[2] = __byte_perm_S (w1[2], w1[1], selector); w3[1] = __byte_perm_S (w1[1], w1[0], selector); w3[0] = __byte_perm_S (w1[0], w0[3], selector); w2[3] = __byte_perm_S (w0[3], w0[2], selector); w2[2] = __byte_perm_S (w0[2], w0[1], selector); w2[1] = __byte_perm_S (w0[1], w0[0], selector); w2[0] = __byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: w7[3] = __byte_perm_S (w5[2], w5[1], selector); w7[2] = __byte_perm_S (w5[1], w5[0], selector); w7[1] = __byte_perm_S (w5[0], w4[3], selector); w7[0] = __byte_perm_S (w4[3], w4[2], selector); w6[3] = __byte_perm_S (w4[2], w4[1], selector); w6[2] = __byte_perm_S (w4[1], w4[0], selector); w6[1] = __byte_perm_S (w4[0], w3[3], selector); w6[0] = __byte_perm_S (w3[3], w3[2], selector); w5[3] = __byte_perm_S (w3[2], w3[1], selector); w5[2] = __byte_perm_S (w3[1], w3[0], selector); w5[1] = __byte_perm_S (w3[0], w2[3], selector); w5[0] = __byte_perm_S (w2[3], w2[2], selector); w4[3] = __byte_perm_S (w2[2], w2[1], selector); w4[2] = __byte_perm_S (w2[1], w2[0], selector); w4[1] = __byte_perm_S (w2[0], w1[3], selector); w4[0] = __byte_perm_S (w1[3], w1[2], selector); w3[3] = __byte_perm_S (w1[2], w1[1], selector); w3[2] = __byte_perm_S (w1[1], w1[0], selector); w3[1] = __byte_perm_S (w1[0], w0[3], selector); w3[0] = __byte_perm_S (w0[3], w0[2], selector); w2[3] = __byte_perm_S (w0[2], w0[1], selector); w2[2] = __byte_perm_S (w0[1], w0[0], selector); w2[1] = __byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: w7[3] = __byte_perm_S (w5[1], w5[0], selector); w7[2] = __byte_perm_S (w5[0], w4[3], selector); w7[1] = __byte_perm_S (w4[3], w4[2], selector); w7[0] = 
__byte_perm_S (w4[2], w4[1], selector); w6[3] = __byte_perm_S (w4[1], w4[0], selector); w6[2] = __byte_perm_S (w4[0], w3[3], selector); w6[1] = __byte_perm_S (w3[3], w3[2], selector); w6[0] = __byte_perm_S (w3[2], w3[1], selector); w5[3] = __byte_perm_S (w3[1], w3[0], selector); w5[2] = __byte_perm_S (w3[0], w2[3], selector); w5[1] = __byte_perm_S (w2[3], w2[2], selector); w5[0] = __byte_perm_S (w2[2], w2[1], selector); w4[3] = __byte_perm_S (w2[1], w2[0], selector); w4[2] = __byte_perm_S (w2[0], w1[3], selector); w4[1] = __byte_perm_S (w1[3], w1[2], selector); w4[0] = __byte_perm_S (w1[2], w1[1], selector); w3[3] = __byte_perm_S (w1[1], w1[0], selector); w3[2] = __byte_perm_S (w1[0], w0[3], selector); w3[1] = __byte_perm_S (w0[3], w0[2], selector); w3[0] = __byte_perm_S (w0[2], w0[1], selector); w2[3] = __byte_perm_S (w0[1], w0[0], selector); w2[2] = __byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: w7[3] = __byte_perm_S (w5[0], w4[3], selector); w7[2] = __byte_perm_S (w4[3], w4[2], selector); w7[1] = __byte_perm_S (w4[2], w4[1], selector); w7[0] = __byte_perm_S (w4[1], w4[0], selector); w6[3] = __byte_perm_S (w4[0], w3[3], selector); w6[2] = __byte_perm_S (w3[3], w3[2], selector); w6[1] = __byte_perm_S (w3[2], w3[1], selector); w6[0] = __byte_perm_S (w3[1], w3[0], selector); w5[3] = __byte_perm_S (w3[0], w2[3], selector); w5[2] = __byte_perm_S (w2[3], w2[2], selector); w5[1] = __byte_perm_S (w2[2], w2[1], selector); w5[0] = __byte_perm_S (w2[1], w2[0], selector); w4[3] = __byte_perm_S (w2[0], w1[3], selector); w4[2] = __byte_perm_S (w1[3], w1[2], selector); w4[1] = __byte_perm_S (w1[2], w1[1], selector); w4[0] = __byte_perm_S (w1[1], w1[0], selector); w3[3] = __byte_perm_S (w1[0], w0[3], selector); w3[2] = __byte_perm_S (w0[3], w0[2], selector); w3[1] = __byte_perm_S (w0[2], w0[1], selector); w3[0] = __byte_perm_S (w0[1], w0[0], selector); w2[3] = 
__byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: w7[3] = __byte_perm_S (w4[3], w4[2], selector); w7[2] = __byte_perm_S (w4[2], w4[1], selector); w7[1] = __byte_perm_S (w4[1], w4[0], selector); w7[0] = __byte_perm_S (w4[0], w3[3], selector); w6[3] = __byte_perm_S (w3[3], w3[2], selector); w6[2] = __byte_perm_S (w3[2], w3[1], selector); w6[1] = __byte_perm_S (w3[1], w3[0], selector); w6[0] = __byte_perm_S (w3[0], w2[3], selector); w5[3] = __byte_perm_S (w2[3], w2[2], selector); w5[2] = __byte_perm_S (w2[2], w2[1], selector); w5[1] = __byte_perm_S (w2[1], w2[0], selector); w5[0] = __byte_perm_S (w2[0], w1[3], selector); w4[3] = __byte_perm_S (w1[3], w1[2], selector); w4[2] = __byte_perm_S (w1[2], w1[1], selector); w4[1] = __byte_perm_S (w1[1], w1[0], selector); w4[0] = __byte_perm_S (w1[0], w0[3], selector); w3[3] = __byte_perm_S (w0[3], w0[2], selector); w3[2] = __byte_perm_S (w0[2], w0[1], selector); w3[1] = __byte_perm_S (w0[1], w0[0], selector); w3[0] = __byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: w7[3] = __byte_perm_S (w4[2], w4[1], selector); w7[2] = __byte_perm_S (w4[1], w4[0], selector); w7[1] = __byte_perm_S (w4[0], w3[3], selector); w7[0] = __byte_perm_S (w3[3], w3[2], selector); w6[3] = __byte_perm_S (w3[2], w3[1], selector); w6[2] = __byte_perm_S (w3[1], w3[0], selector); w6[1] = __byte_perm_S (w3[0], w2[3], selector); w6[0] = __byte_perm_S (w2[3], w2[2], selector); w5[3] = __byte_perm_S (w2[2], w2[1], selector); w5[2] = __byte_perm_S (w2[1], w2[0], selector); w5[1] = __byte_perm_S (w2[0], w1[3], selector); w5[0] = __byte_perm_S (w1[3], w1[2], selector); w4[3] = __byte_perm_S (w1[2], w1[1], selector); w4[2] = __byte_perm_S (w1[1], w1[0], selector); w4[1] = __byte_perm_S (w1[0], w0[3], selector); 
w4[0] = __byte_perm_S (w0[3], w0[2], selector); w3[3] = __byte_perm_S (w0[2], w0[1], selector); w3[2] = __byte_perm_S (w0[1], w0[0], selector); w3[1] = __byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: w7[3] = __byte_perm_S (w4[1], w4[0], selector); w7[2] = __byte_perm_S (w4[0], w3[3], selector); w7[1] = __byte_perm_S (w3[3], w3[2], selector); w7[0] = __byte_perm_S (w3[2], w3[1], selector); w6[3] = __byte_perm_S (w3[1], w3[0], selector); w6[2] = __byte_perm_S (w3[0], w2[3], selector); w6[1] = __byte_perm_S (w2[3], w2[2], selector); w6[0] = __byte_perm_S (w2[2], w2[1], selector); w5[3] = __byte_perm_S (w2[1], w2[0], selector); w5[2] = __byte_perm_S (w2[0], w1[3], selector); w5[1] = __byte_perm_S (w1[3], w1[2], selector); w5[0] = __byte_perm_S (w1[2], w1[1], selector); w4[3] = __byte_perm_S (w1[1], w1[0], selector); w4[2] = __byte_perm_S (w1[0], w0[3], selector); w4[1] = __byte_perm_S (w0[3], w0[2], selector); w4[0] = __byte_perm_S (w0[2], w0[1], selector); w3[3] = __byte_perm_S (w0[1], w0[0], selector); w3[2] = __byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: w7[3] = __byte_perm_S (w4[0], w3[3], selector); w7[2] = __byte_perm_S (w3[3], w3[2], selector); w7[1] = __byte_perm_S (w3[2], w3[1], selector); w7[0] = __byte_perm_S (w3[1], w3[0], selector); w6[3] = __byte_perm_S (w3[0], w2[3], selector); w6[2] = __byte_perm_S (w2[3], w2[2], selector); w6[1] = __byte_perm_S (w2[2], w2[1], selector); w6[0] = __byte_perm_S (w2[1], w2[0], selector); w5[3] = __byte_perm_S (w2[0], w1[3], selector); w5[2] = __byte_perm_S (w1[3], w1[2], selector); w5[1] = __byte_perm_S (w1[2], w1[1], selector); w5[0] = __byte_perm_S (w1[1], w1[0], selector); w4[3] = __byte_perm_S (w1[0], w0[3], 
selector); w4[2] = __byte_perm_S (w0[3], w0[2], selector); w4[1] = __byte_perm_S (w0[2], w0[1], selector); w4[0] = __byte_perm_S (w0[1], w0[0], selector); w3[3] = __byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: w7[3] = __byte_perm_S (w3[3], w3[2], selector); w7[2] = __byte_perm_S (w3[2], w3[1], selector); w7[1] = __byte_perm_S (w3[1], w3[0], selector); w7[0] = __byte_perm_S (w3[0], w2[3], selector); w6[3] = __byte_perm_S (w2[3], w2[2], selector); w6[2] = __byte_perm_S (w2[2], w2[1], selector); w6[1] = __byte_perm_S (w2[1], w2[0], selector); w6[0] = __byte_perm_S (w2[0], w1[3], selector); w5[3] = __byte_perm_S (w1[3], w1[2], selector); w5[2] = __byte_perm_S (w1[2], w1[1], selector); w5[1] = __byte_perm_S (w1[1], w1[0], selector); w5[0] = __byte_perm_S (w1[0], w0[3], selector); w4[3] = __byte_perm_S (w0[3], w0[2], selector); w4[2] = __byte_perm_S (w0[2], w0[1], selector); w4[1] = __byte_perm_S (w0[1], w0[0], selector); w4[0] = __byte_perm_S (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: w7[3] = __byte_perm_S (w3[2], w3[1], selector); w7[2] = __byte_perm_S (w3[1], w3[0], selector); w7[1] = __byte_perm_S (w3[0], w2[3], selector); w7[0] = __byte_perm_S (w2[3], w2[2], selector); w6[3] = __byte_perm_S (w2[2], w2[1], selector); w6[2] = __byte_perm_S (w2[1], w2[0], selector); w6[1] = __byte_perm_S (w2[0], w1[3], selector); w6[0] = __byte_perm_S (w1[3], w1[2], selector); w5[3] = __byte_perm_S (w1[2], w1[1], selector); w5[2] = __byte_perm_S (w1[1], w1[0], selector); w5[1] = __byte_perm_S (w1[0], w0[3], selector); w5[0] = __byte_perm_S (w0[3], w0[2], selector); w4[3] = __byte_perm_S (w0[2], w0[1], selector); w4[2] = __byte_perm_S (w0[1], w0[0], 
selector); w4[1] = __byte_perm_S (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: w7[3] = __byte_perm_S (w3[1], w3[0], selector); w7[2] = __byte_perm_S (w3[0], w2[3], selector); w7[1] = __byte_perm_S (w2[3], w2[2], selector); w7[0] = __byte_perm_S (w2[2], w2[1], selector); w6[3] = __byte_perm_S (w2[1], w2[0], selector); w6[2] = __byte_perm_S (w2[0], w1[3], selector); w6[1] = __byte_perm_S (w1[3], w1[2], selector); w6[0] = __byte_perm_S (w1[2], w1[1], selector); w5[3] = __byte_perm_S (w1[1], w1[0], selector); w5[2] = __byte_perm_S (w1[0], w0[3], selector); w5[1] = __byte_perm_S (w0[3], w0[2], selector); w5[0] = __byte_perm_S (w0[2], w0[1], selector); w4[3] = __byte_perm_S (w0[1], w0[0], selector); w4[2] = __byte_perm_S (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: w7[3] = __byte_perm_S (w3[0], w2[3], selector); w7[2] = __byte_perm_S (w2[3], w2[2], selector); w7[1] = __byte_perm_S (w2[2], w2[1], selector); w7[0] = __byte_perm_S (w2[1], w2[0], selector); w6[3] = __byte_perm_S (w2[0], w1[3], selector); w6[2] = __byte_perm_S (w1[3], w1[2], selector); w6[1] = __byte_perm_S (w1[2], w1[1], selector); w6[0] = __byte_perm_S (w1[1], w1[0], selector); w5[3] = __byte_perm_S (w1[0], w0[3], selector); w5[2] = __byte_perm_S (w0[3], w0[2], selector); w5[1] = __byte_perm_S (w0[2], w0[1], selector); w5[0] = __byte_perm_S (w0[1], w0[0], selector); w4[3] = __byte_perm_S (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: w7[3] = 
__byte_perm_S (w2[3], w2[2], selector); w7[2] = __byte_perm_S (w2[2], w2[1], selector); w7[1] = __byte_perm_S (w2[1], w2[0], selector); w7[0] = __byte_perm_S (w2[0], w1[3], selector); w6[3] = __byte_perm_S (w1[3], w1[2], selector); w6[2] = __byte_perm_S (w1[2], w1[1], selector); w6[1] = __byte_perm_S (w1[1], w1[0], selector); w6[0] = __byte_perm_S (w1[0], w0[3], selector); w5[3] = __byte_perm_S (w0[3], w0[2], selector); w5[2] = __byte_perm_S (w0[2], w0[1], selector); w5[1] = __byte_perm_S (w0[1], w0[0], selector); w5[0] = __byte_perm_S (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: w7[3] = __byte_perm_S (w2[2], w2[1], selector); w7[2] = __byte_perm_S (w2[1], w2[0], selector); w7[1] = __byte_perm_S (w2[0], w1[3], selector); w7[0] = __byte_perm_S (w1[3], w1[2], selector); w6[3] = __byte_perm_S (w1[2], w1[1], selector); w6[2] = __byte_perm_S (w1[1], w1[0], selector); w6[1] = __byte_perm_S (w1[0], w0[3], selector); w6[0] = __byte_perm_S (w0[3], w0[2], selector); w5[3] = __byte_perm_S (w0[2], w0[1], selector); w5[2] = __byte_perm_S (w0[1], w0[0], selector); w5[1] = __byte_perm_S (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: w7[3] = __byte_perm_S (w2[1], w2[0], selector); w7[2] = __byte_perm_S (w2[0], w1[3], selector); w7[1] = __byte_perm_S (w1[3], w1[2], selector); w7[0] = __byte_perm_S (w1[2], w1[1], selector); w6[3] = __byte_perm_S (w1[1], w1[0], selector); w6[2] = __byte_perm_S (w1[0], w0[3], selector); w6[1] = __byte_perm_S (w0[3], w0[2], selector); w6[0] = __byte_perm_S (w0[2], w0[1], selector); w5[3] = __byte_perm_S (w0[1], w0[0], 
selector); w5[2] = __byte_perm_S (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: w7[3] = __byte_perm_S (w2[0], w1[3], selector); w7[2] = __byte_perm_S (w1[3], w1[2], selector); w7[1] = __byte_perm_S (w1[2], w1[1], selector); w7[0] = __byte_perm_S (w1[1], w1[0], selector); w6[3] = __byte_perm_S (w1[0], w0[3], selector); w6[2] = __byte_perm_S (w0[3], w0[2], selector); w6[1] = __byte_perm_S (w0[2], w0[1], selector); w6[0] = __byte_perm_S (w0[1], w0[0], selector); w5[3] = __byte_perm_S (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: w7[3] = __byte_perm_S (w1[3], w1[2], selector); w7[2] = __byte_perm_S (w1[2], w1[1], selector); w7[1] = __byte_perm_S (w1[1], w1[0], selector); w7[0] = __byte_perm_S (w1[0], w0[3], selector); w6[3] = __byte_perm_S (w0[3], w0[2], selector); w6[2] = __byte_perm_S (w0[2], w0[1], selector); w6[1] = __byte_perm_S (w0[1], w0[0], selector); w6[0] = __byte_perm_S (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: w7[3] = __byte_perm_S (w1[2], w1[1], selector); w7[2] = __byte_perm_S (w1[1], w1[0], selector); w7[1] = __byte_perm_S (w1[0], w0[3], selector); w7[0] = __byte_perm_S (w0[3], w0[2], selector); w6[3] = __byte_perm_S (w0[2], w0[1], selector); w6[2] = __byte_perm_S (w0[1], w0[0], selector); w6[1] = __byte_perm_S (w0[0], 0, 
selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: w7[3] = __byte_perm_S (w1[1], w1[0], selector); w7[2] = __byte_perm_S (w1[0], w0[3], selector); w7[1] = __byte_perm_S (w0[3], w0[2], selector); w7[0] = __byte_perm_S (w0[2], w0[1], selector); w6[3] = __byte_perm_S (w0[1], w0[0], selector); w6[2] = __byte_perm_S (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: w7[3] = __byte_perm_S (w1[0], w0[3], selector); w7[2] = __byte_perm_S (w0[3], w0[2], selector); w7[1] = __byte_perm_S (w0[2], w0[1], selector); w7[0] = __byte_perm_S (w0[1], w0[0], selector); w6[3] = __byte_perm_S (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: w7[3] = __byte_perm_S (w0[3], w0[2], selector); w7[2] = __byte_perm_S (w0[2], w0[1], selector); w7[1] = __byte_perm_S (w0[1], w0[0], selector); w7[0] = __byte_perm_S (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: w7[3] = __byte_perm_S (w0[2], w0[1], selector); 
w7[2] = __byte_perm_S (w0[1], w0[0], selector); w7[1] = __byte_perm_S (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: w7[3] = __byte_perm_S (w0[1], w0[0], selector); w7[2] = __byte_perm_S (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: w7[3] = __byte_perm_S (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_8x4_carry_be_S (u32 w0[4], u32 w1[4], u32 w2[4], u32 w3[4], u32 w4[4], u32 w5[4], u32 w6[4], u32 w7[4], u32 c0[4], u32 c1[4], u32 c2[4], u32 c3[4], u32 c4[4], u32 c5[4], u32 c6[4], u32 c7[4], const u32 offset) { const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: c0[0] = amd_bytealign_S (w7[3], 0, offset); w7[3] = amd_bytealign_S (w7[2], w7[3], offset); w7[2] = amd_bytealign_S (w7[1], w7[2], offset); w7[1] = amd_bytealign_S (w7[0], w7[1], offset); w7[0] = amd_bytealign_S (w6[3], w7[0], offset); w6[3] = amd_bytealign_S (w6[2], w6[3], offset); w6[2] = amd_bytealign_S (w6[1], w6[2], offset); w6[1] = 
amd_bytealign_S (w6[0], w6[1], offset); w6[0] = amd_bytealign_S (w5[3], w6[0], offset); w5[3] = amd_bytealign_S (w5[2], w5[3], offset); w5[2] = amd_bytealign_S (w5[1], w5[2], offset); w5[1] = amd_bytealign_S (w5[0], w5[1], offset); w5[0] = amd_bytealign_S (w4[3], w5[0], offset); w4[3] = amd_bytealign_S (w4[2], w4[3], offset); w4[2] = amd_bytealign_S (w4[1], w4[2], offset); w4[1] = amd_bytealign_S (w4[0], w4[1], offset); w4[0] = amd_bytealign_S (w3[3], w4[0], offset); w3[3] = amd_bytealign_S (w3[2], w3[3], offset); w3[2] = amd_bytealign_S (w3[1], w3[2], offset); w3[1] = amd_bytealign_S (w3[0], w3[1], offset); w3[0] = amd_bytealign_S (w2[3], w3[0], offset); w2[3] = amd_bytealign_S (w2[2], w2[3], offset); w2[2] = amd_bytealign_S (w2[1], w2[2], offset); w2[1] = amd_bytealign_S (w2[0], w2[1], offset); w2[0] = amd_bytealign_S (w1[3], w2[0], offset); w1[3] = amd_bytealign_S (w1[2], w1[3], offset); w1[2] = amd_bytealign_S (w1[1], w1[2], offset); w1[1] = amd_bytealign_S (w1[0], w1[1], offset); w1[0] = amd_bytealign_S (w0[3], w1[0], offset); w0[3] = amd_bytealign_S (w0[2], w0[3], offset); w0[2] = amd_bytealign_S (w0[1], w0[2], offset); w0[1] = amd_bytealign_S (w0[0], w0[1], offset); w0[0] = amd_bytealign_S ( 0, w0[0], offset); break; case 1: c0[1] = amd_bytealign_S (w7[3], 0, offset); c0[0] = amd_bytealign_S (w7[2], w7[3], offset); w7[3] = amd_bytealign_S (w7[1], w7[2], offset); w7[2] = amd_bytealign_S (w7[0], w7[1], offset); w7[1] = amd_bytealign_S (w6[3], w7[0], offset); w7[0] = amd_bytealign_S (w6[2], w6[3], offset); w6[3] = amd_bytealign_S (w6[1], w6[2], offset); w6[2] = amd_bytealign_S (w6[0], w6[1], offset); w6[1] = amd_bytealign_S (w5[3], w6[0], offset); w6[0] = amd_bytealign_S (w5[2], w5[3], offset); w5[3] = amd_bytealign_S (w5[1], w5[2], offset); w5[2] = amd_bytealign_S (w5[0], w5[1], offset); w5[1] = amd_bytealign_S (w4[3], w5[0], offset); w5[0] = amd_bytealign_S (w4[2], w4[3], offset); w4[3] = amd_bytealign_S (w4[1], w4[2], offset); w4[2] = amd_bytealign_S (w4[0], 
w4[1], offset); w4[1] = amd_bytealign_S (w3[3], w4[0], offset); w4[0] = amd_bytealign_S (w3[2], w3[3], offset); w3[3] = amd_bytealign_S (w3[1], w3[2], offset); w3[2] = amd_bytealign_S (w3[0], w3[1], offset); w3[1] = amd_bytealign_S (w2[3], w3[0], offset); w3[0] = amd_bytealign_S (w2[2], w2[3], offset); w2[3] = amd_bytealign_S (w2[1], w2[2], offset); w2[2] = amd_bytealign_S (w2[0], w2[1], offset); w2[1] = amd_bytealign_S (w1[3], w2[0], offset); w2[0] = amd_bytealign_S (w1[2], w1[3], offset); w1[3] = amd_bytealign_S (w1[1], w1[2], offset); w1[2] = amd_bytealign_S (w1[0], w1[1], offset); w1[1] = amd_bytealign_S (w0[3], w1[0], offset); w1[0] = amd_bytealign_S (w0[2], w0[3], offset); w0[3] = amd_bytealign_S (w0[1], w0[2], offset); w0[2] = amd_bytealign_S (w0[0], w0[1], offset); w0[1] = amd_bytealign_S ( 0, w0[0], offset); w0[0] = 0; break; case 2: c0[2] = amd_bytealign_S (w7[3], 0, offset); c0[1] = amd_bytealign_S (w7[2], w7[3], offset); c0[0] = amd_bytealign_S (w7[1], w7[2], offset); w7[3] = amd_bytealign_S (w7[0], w7[1], offset); w7[2] = amd_bytealign_S (w6[3], w7[0], offset); w7[1] = amd_bytealign_S (w6[2], w6[3], offset); w7[0] = amd_bytealign_S (w6[1], w6[2], offset); w6[3] = amd_bytealign_S (w6[0], w6[1], offset); w6[2] = amd_bytealign_S (w5[3], w6[0], offset); w6[1] = amd_bytealign_S (w5[2], w5[3], offset); w6[0] = amd_bytealign_S (w5[1], w5[2], offset); w5[3] = amd_bytealign_S (w5[0], w5[1], offset); w5[2] = amd_bytealign_S (w4[3], w5[0], offset); w5[1] = amd_bytealign_S (w4[2], w4[3], offset); w5[0] = amd_bytealign_S (w4[1], w4[2], offset); w4[3] = amd_bytealign_S (w4[0], w4[1], offset); w4[2] = amd_bytealign_S (w3[3], w4[0], offset); w4[1] = amd_bytealign_S (w3[2], w3[3], offset); w4[0] = amd_bytealign_S (w3[1], w3[2], offset); w3[3] = amd_bytealign_S (w3[0], w3[1], offset); w3[2] = amd_bytealign_S (w2[3], w3[0], offset); w3[1] = amd_bytealign_S (w2[2], w2[3], offset); w3[0] = amd_bytealign_S (w2[1], w2[2], offset); w2[3] = amd_bytealign_S (w2[0], w2[1], 
offset); w2[2] = amd_bytealign_S (w1[3], w2[0], offset); w2[1] = amd_bytealign_S (w1[2], w1[3], offset); w2[0] = amd_bytealign_S (w1[1], w1[2], offset); w1[3] = amd_bytealign_S (w1[0], w1[1], offset); w1[2] = amd_bytealign_S (w0[3], w1[0], offset); w1[1] = amd_bytealign_S (w0[2], w0[3], offset); w1[0] = amd_bytealign_S (w0[1], w0[2], offset); w0[3] = amd_bytealign_S (w0[0], w0[1], offset); w0[2] = amd_bytealign_S ( 0, w0[0], offset); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = amd_bytealign_S (w7[3], 0, offset); c0[2] = amd_bytealign_S (w7[2], w7[3], offset); c0[1] = amd_bytealign_S (w7[1], w7[2], offset); c0[0] = amd_bytealign_S (w7[0], w7[1], offset); w7[3] = amd_bytealign_S (w6[3], w7[0], offset); w7[2] = amd_bytealign_S (w6[2], w6[3], offset); w7[1] = amd_bytealign_S (w6[1], w6[2], offset); w7[0] = amd_bytealign_S (w6[0], w6[1], offset); w6[3] = amd_bytealign_S (w5[3], w6[0], offset); w6[2] = amd_bytealign_S (w5[2], w5[3], offset); w6[1] = amd_bytealign_S (w5[1], w5[2], offset); w6[0] = amd_bytealign_S (w5[0], w5[1], offset); w5[3] = amd_bytealign_S (w4[3], w5[0], offset); w5[2] = amd_bytealign_S (w4[2], w4[3], offset); w5[1] = amd_bytealign_S (w4[1], w4[2], offset); w5[0] = amd_bytealign_S (w4[0], w4[1], offset); w4[3] = amd_bytealign_S (w3[3], w4[0], offset); w4[2] = amd_bytealign_S (w3[2], w3[3], offset); w4[1] = amd_bytealign_S (w3[1], w3[2], offset); w4[0] = amd_bytealign_S (w3[0], w3[1], offset); w3[3] = amd_bytealign_S (w2[3], w3[0], offset); w3[2] = amd_bytealign_S (w2[2], w2[3], offset); w3[1] = amd_bytealign_S (w2[1], w2[2], offset); w3[0] = amd_bytealign_S (w2[0], w2[1], offset); w2[3] = amd_bytealign_S (w1[3], w2[0], offset); w2[2] = amd_bytealign_S (w1[2], w1[3], offset); w2[1] = amd_bytealign_S (w1[1], w1[2], offset); w2[0] = amd_bytealign_S (w1[0], w1[1], offset); w1[3] = amd_bytealign_S (w0[3], w1[0], offset); w1[2] = amd_bytealign_S (w0[2], w0[3], offset); w1[1] = amd_bytealign_S (w0[1], w0[2], offset); w1[0] = amd_bytealign_S (w0[0], w0[1], 
offset); w0[3] = amd_bytealign_S ( 0, w0[0], offset); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = amd_bytealign_S (w7[3], 0, offset); c0[3] = amd_bytealign_S (w7[2], w7[3], offset); c0[2] = amd_bytealign_S (w7[1], w7[2], offset); c0[1] = amd_bytealign_S (w7[0], w7[1], offset); c0[0] = amd_bytealign_S (w6[3], w7[0], offset); w7[3] = amd_bytealign_S (w6[2], w6[3], offset); w7[2] = amd_bytealign_S (w6[1], w6[2], offset); w7[1] = amd_bytealign_S (w6[0], w6[1], offset); w7[0] = amd_bytealign_S (w5[3], w6[0], offset); w6[3] = amd_bytealign_S (w5[2], w5[3], offset); w6[2] = amd_bytealign_S (w5[1], w5[2], offset); w6[1] = amd_bytealign_S (w5[0], w5[1], offset); w6[0] = amd_bytealign_S (w4[3], w5[0], offset); w5[3] = amd_bytealign_S (w4[2], w4[3], offset); w5[2] = amd_bytealign_S (w4[1], w4[2], offset); w5[1] = amd_bytealign_S (w4[0], w4[1], offset); w5[0] = amd_bytealign_S (w3[3], w4[0], offset); w4[3] = amd_bytealign_S (w3[2], w3[3], offset); w4[2] = amd_bytealign_S (w3[1], w3[2], offset); w4[1] = amd_bytealign_S (w3[0], w3[1], offset); w4[0] = amd_bytealign_S (w2[3], w3[0], offset); w3[3] = amd_bytealign_S (w2[2], w2[3], offset); w3[2] = amd_bytealign_S (w2[1], w2[2], offset); w3[1] = amd_bytealign_S (w2[0], w2[1], offset); w3[0] = amd_bytealign_S (w1[3], w2[0], offset); w2[3] = amd_bytealign_S (w1[2], w1[3], offset); w2[2] = amd_bytealign_S (w1[1], w1[2], offset); w2[1] = amd_bytealign_S (w1[0], w1[1], offset); w2[0] = amd_bytealign_S (w0[3], w1[0], offset); w1[3] = amd_bytealign_S (w0[2], w0[3], offset); w1[2] = amd_bytealign_S (w0[1], w0[2], offset); w1[1] = amd_bytealign_S (w0[0], w0[1], offset); w1[0] = amd_bytealign_S ( 0, w0[0], offset); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = amd_bytealign_S (w7[3], 0, offset); c1[0] = amd_bytealign_S (w7[2], w7[3], offset); c0[3] = amd_bytealign_S (w7[1], w7[2], offset); c0[2] = amd_bytealign_S (w7[0], w7[1], offset); c0[1] = amd_bytealign_S (w6[3], w7[0], offset); c0[0] = amd_bytealign_S 
(w6[2], w6[3], offset); w7[3] = amd_bytealign_S (w6[1], w6[2], offset); w7[2] = amd_bytealign_S (w6[0], w6[1], offset); w7[1] = amd_bytealign_S (w5[3], w6[0], offset); w7[0] = amd_bytealign_S (w5[2], w5[3], offset); w6[3] = amd_bytealign_S (w5[1], w5[2], offset); w6[2] = amd_bytealign_S (w5[0], w5[1], offset); w6[1] = amd_bytealign_S (w4[3], w5[0], offset); w6[0] = amd_bytealign_S (w4[2], w4[3], offset); w5[3] = amd_bytealign_S (w4[1], w4[2], offset); w5[2] = amd_bytealign_S (w4[0], w4[1], offset); w5[1] = amd_bytealign_S (w3[3], w4[0], offset); w5[0] = amd_bytealign_S (w3[2], w3[3], offset); w4[3] = amd_bytealign_S (w3[1], w3[2], offset); w4[2] = amd_bytealign_S (w3[0], w3[1], offset); w4[1] = amd_bytealign_S (w2[3], w3[0], offset); w4[0] = amd_bytealign_S (w2[2], w2[3], offset); w3[3] = amd_bytealign_S (w2[1], w2[2], offset); w3[2] = amd_bytealign_S (w2[0], w2[1], offset); w3[1] = amd_bytealign_S (w1[3], w2[0], offset); w3[0] = amd_bytealign_S (w1[2], w1[3], offset); w2[3] = amd_bytealign_S (w1[1], w1[2], offset); w2[2] = amd_bytealign_S (w1[0], w1[1], offset); w2[1] = amd_bytealign_S (w0[3], w1[0], offset); w2[0] = amd_bytealign_S (w0[2], w0[3], offset); w1[3] = amd_bytealign_S (w0[1], w0[2], offset); w1[2] = amd_bytealign_S (w0[0], w0[1], offset); w1[1] = amd_bytealign_S ( 0, w0[0], offset); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = amd_bytealign_S (w7[3], 0, offset); c1[1] = amd_bytealign_S (w7[2], w7[3], offset); c1[0] = amd_bytealign_S (w7[1], w7[2], offset); c0[3] = amd_bytealign_S (w7[0], w7[1], offset); c0[2] = amd_bytealign_S (w6[3], w7[0], offset); c0[1] = amd_bytealign_S (w6[2], w6[3], offset); c0[0] = amd_bytealign_S (w6[1], w6[2], offset); w7[3] = amd_bytealign_S (w6[0], w6[1], offset); w7[2] = amd_bytealign_S (w5[3], w6[0], offset); w7[1] = amd_bytealign_S (w5[2], w5[3], offset); w7[0] = amd_bytealign_S (w5[1], w5[2], offset); w6[3] = amd_bytealign_S (w5[0], w5[1], offset); w6[2] = amd_bytealign_S (w4[3], w5[0], 
offset); w6[1] = amd_bytealign_S (w4[2], w4[3], offset); w6[0] = amd_bytealign_S (w4[1], w4[2], offset); w5[3] = amd_bytealign_S (w4[0], w4[1], offset); w5[2] = amd_bytealign_S (w3[3], w4[0], offset); w5[1] = amd_bytealign_S (w3[2], w3[3], offset); w5[0] = amd_bytealign_S (w3[1], w3[2], offset); w4[3] = amd_bytealign_S (w3[0], w3[1], offset); w4[2] = amd_bytealign_S (w2[3], w3[0], offset); w4[1] = amd_bytealign_S (w2[2], w2[3], offset); w4[0] = amd_bytealign_S (w2[1], w2[2], offset); w3[3] = amd_bytealign_S (w2[0], w2[1], offset); w3[2] = amd_bytealign_S (w1[3], w2[0], offset); w3[1] = amd_bytealign_S (w1[2], w1[3], offset); w3[0] = amd_bytealign_S (w1[1], w1[2], offset); w2[3] = amd_bytealign_S (w1[0], w1[1], offset); w2[2] = amd_bytealign_S (w0[3], w1[0], offset); w2[1] = amd_bytealign_S (w0[2], w0[3], offset); w2[0] = amd_bytealign_S (w0[1], w0[2], offset); w1[3] = amd_bytealign_S (w0[0], w0[1], offset); w1[2] = amd_bytealign_S ( 0, w0[0], offset); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = amd_bytealign_S (w7[3], 0, offset); c1[2] = amd_bytealign_S (w7[2], w7[3], offset); c1[1] = amd_bytealign_S (w7[1], w7[2], offset); c1[0] = amd_bytealign_S (w7[0], w7[1], offset); c0[3] = amd_bytealign_S (w6[3], w7[0], offset); c0[2] = amd_bytealign_S (w6[2], w6[3], offset); c0[1] = amd_bytealign_S (w6[1], w6[2], offset); c0[0] = amd_bytealign_S (w6[0], w6[1], offset); w7[3] = amd_bytealign_S (w5[3], w6[0], offset); w7[2] = amd_bytealign_S (w5[2], w5[3], offset); w7[1] = amd_bytealign_S (w5[1], w5[2], offset); w7[0] = amd_bytealign_S (w5[0], w5[1], offset); w6[3] = amd_bytealign_S (w4[3], w5[0], offset); w6[2] = amd_bytealign_S (w4[2], w4[3], offset); w6[1] = amd_bytealign_S (w4[1], w4[2], offset); w6[0] = amd_bytealign_S (w4[0], w4[1], offset); w5[3] = amd_bytealign_S (w3[3], w4[0], offset); w5[2] = amd_bytealign_S (w3[2], w3[3], offset); w5[1] = amd_bytealign_S (w3[1], w3[2], offset); w5[0] = amd_bytealign_S (w3[0], w3[1], 
offset); w4[3] = amd_bytealign_S (w2[3], w3[0], offset); w4[2] = amd_bytealign_S (w2[2], w2[3], offset); w4[1] = amd_bytealign_S (w2[1], w2[2], offset); w4[0] = amd_bytealign_S (w2[0], w2[1], offset); w3[3] = amd_bytealign_S (w1[3], w2[0], offset); w3[2] = amd_bytealign_S (w1[2], w1[3], offset); w3[1] = amd_bytealign_S (w1[1], w1[2], offset); w3[0] = amd_bytealign_S (w1[0], w1[1], offset); w2[3] = amd_bytealign_S (w0[3], w1[0], offset); w2[2] = amd_bytealign_S (w0[2], w0[3], offset); w2[1] = amd_bytealign_S (w0[1], w0[2], offset); w2[0] = amd_bytealign_S (w0[0], w0[1], offset); w1[3] = amd_bytealign_S ( 0, w0[0], offset); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = amd_bytealign_S (w7[3], 0, offset); c1[3] = amd_bytealign_S (w7[2], w7[3], offset); c1[2] = amd_bytealign_S (w7[1], w7[2], offset); c1[1] = amd_bytealign_S (w7[0], w7[1], offset); c1[0] = amd_bytealign_S (w6[3], w7[0], offset); c0[3] = amd_bytealign_S (w6[2], w6[3], offset); c0[2] = amd_bytealign_S (w6[1], w6[2], offset); c0[1] = amd_bytealign_S (w6[0], w6[1], offset); c0[0] = amd_bytealign_S (w5[3], w6[0], offset); w7[3] = amd_bytealign_S (w5[2], w5[3], offset); w7[2] = amd_bytealign_S (w5[1], w5[2], offset); w7[1] = amd_bytealign_S (w5[0], w5[1], offset); w7[0] = amd_bytealign_S (w4[3], w5[0], offset); w6[3] = amd_bytealign_S (w4[2], w4[3], offset); w6[2] = amd_bytealign_S (w4[1], w4[2], offset); w6[1] = amd_bytealign_S (w4[0], w4[1], offset); w6[0] = amd_bytealign_S (w3[3], w4[0], offset); w5[3] = amd_bytealign_S (w3[2], w3[3], offset); w5[2] = amd_bytealign_S (w3[1], w3[2], offset); w5[1] = amd_bytealign_S (w3[0], w3[1], offset); w5[0] = amd_bytealign_S (w2[3], w3[0], offset); w4[3] = amd_bytealign_S (w2[2], w2[3], offset); w4[2] = amd_bytealign_S (w2[1], w2[2], offset); w4[1] = amd_bytealign_S (w2[0], w2[1], offset); w4[0] = amd_bytealign_S (w1[3], w2[0], offset); w3[3] = amd_bytealign_S (w1[2], w1[3], offset); w3[2] = amd_bytealign_S (w1[1], 
w1[2], offset); w3[1] = amd_bytealign_S (w1[0], w1[1], offset); w3[0] = amd_bytealign_S (w0[3], w1[0], offset); w2[3] = amd_bytealign_S (w0[2], w0[3], offset); w2[2] = amd_bytealign_S (w0[1], w0[2], offset); w2[1] = amd_bytealign_S (w0[0], w0[1], offset); w2[0] = amd_bytealign_S ( 0, w0[0], offset); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = amd_bytealign_S (w7[3], 0, offset); c2[0] = amd_bytealign_S (w7[2], w7[3], offset); c1[3] = amd_bytealign_S (w7[1], w7[2], offset); c1[2] = amd_bytealign_S (w7[0], w7[1], offset); c1[1] = amd_bytealign_S (w6[3], w7[0], offset); c1[0] = amd_bytealign_S (w6[2], w6[3], offset); c0[3] = amd_bytealign_S (w6[1], w6[2], offset); c0[2] = amd_bytealign_S (w6[0], w6[1], offset); c0[1] = amd_bytealign_S (w5[3], w6[0], offset); c0[0] = amd_bytealign_S (w5[2], w5[3], offset); w7[3] = amd_bytealign_S (w5[1], w5[2], offset); w7[2] = amd_bytealign_S (w5[0], w5[1], offset); w7[1] = amd_bytealign_S (w4[3], w5[0], offset); w7[0] = amd_bytealign_S (w4[2], w4[3], offset); w6[3] = amd_bytealign_S (w4[1], w4[2], offset); w6[2] = amd_bytealign_S (w4[0], w4[1], offset); w6[1] = amd_bytealign_S (w3[3], w4[0], offset); w6[0] = amd_bytealign_S (w3[2], w3[3], offset); w5[3] = amd_bytealign_S (w3[1], w3[2], offset); w5[2] = amd_bytealign_S (w3[0], w3[1], offset); w5[1] = amd_bytealign_S (w2[3], w3[0], offset); w5[0] = amd_bytealign_S (w2[2], w2[3], offset); w4[3] = amd_bytealign_S (w2[1], w2[2], offset); w4[2] = amd_bytealign_S (w2[0], w2[1], offset); w4[1] = amd_bytealign_S (w1[3], w2[0], offset); w4[0] = amd_bytealign_S (w1[2], w1[3], offset); w3[3] = amd_bytealign_S (w1[1], w1[2], offset); w3[2] = amd_bytealign_S (w1[0], w1[1], offset); w3[1] = amd_bytealign_S (w0[3], w1[0], offset); w3[0] = amd_bytealign_S (w0[2], w0[3], offset); w2[3] = amd_bytealign_S (w0[1], w0[2], offset); w2[2] = amd_bytealign_S (w0[0], w0[1], offset); w2[1] = amd_bytealign_S ( 0, w0[0], offset); w2[0] = 0; w1[3] = 
0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = amd_bytealign_S (w7[3], 0, offset); c2[1] = amd_bytealign_S (w7[2], w7[3], offset); c2[0] = amd_bytealign_S (w7[1], w7[2], offset); c1[3] = amd_bytealign_S (w7[0], w7[1], offset); c1[2] = amd_bytealign_S (w6[3], w7[0], offset); c1[1] = amd_bytealign_S (w6[2], w6[3], offset); c1[0] = amd_bytealign_S (w6[1], w6[2], offset); c0[3] = amd_bytealign_S (w6[0], w6[1], offset); c0[2] = amd_bytealign_S (w5[3], w6[0], offset); c0[1] = amd_bytealign_S (w5[2], w5[3], offset); c0[0] = amd_bytealign_S (w5[1], w5[2], offset); w7[3] = amd_bytealign_S (w5[0], w5[1], offset); w7[2] = amd_bytealign_S (w4[3], w5[0], offset); w7[1] = amd_bytealign_S (w4[2], w4[3], offset); w7[0] = amd_bytealign_S (w4[1], w4[2], offset); w6[3] = amd_bytealign_S (w4[0], w4[1], offset); w6[2] = amd_bytealign_S (w3[3], w4[0], offset); w6[1] = amd_bytealign_S (w3[2], w3[3], offset); w6[0] = amd_bytealign_S (w3[1], w3[2], offset); w5[3] = amd_bytealign_S (w3[0], w3[1], offset); w5[2] = amd_bytealign_S (w2[3], w3[0], offset); w5[1] = amd_bytealign_S (w2[2], w2[3], offset); w5[0] = amd_bytealign_S (w2[1], w2[2], offset); w4[3] = amd_bytealign_S (w2[0], w2[1], offset); w4[2] = amd_bytealign_S (w1[3], w2[0], offset); w4[1] = amd_bytealign_S (w1[2], w1[3], offset); w4[0] = amd_bytealign_S (w1[1], w1[2], offset); w3[3] = amd_bytealign_S (w1[0], w1[1], offset); w3[2] = amd_bytealign_S (w0[3], w1[0], offset); w3[1] = amd_bytealign_S (w0[2], w0[3], offset); w3[0] = amd_bytealign_S (w0[1], w0[2], offset); w2[3] = amd_bytealign_S (w0[0], w0[1], offset); w2[2] = amd_bytealign_S ( 0, w0[0], offset); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = amd_bytealign_S (w7[3], 0, offset); c2[2] = amd_bytealign_S (w7[2], w7[3], offset); c2[1] = amd_bytealign_S (w7[1], w7[2], offset); c2[0] = amd_bytealign_S (w7[0], w7[1], offset); c1[3] = 
amd_bytealign_S (w6[3], w7[0], offset); c1[2] = amd_bytealign_S (w6[2], w6[3], offset); c1[1] = amd_bytealign_S (w6[1], w6[2], offset); c1[0] = amd_bytealign_S (w6[0], w6[1], offset); c0[3] = amd_bytealign_S (w5[3], w6[0], offset); c0[2] = amd_bytealign_S (w5[2], w5[3], offset); c0[1] = amd_bytealign_S (w5[1], w5[2], offset); c0[0] = amd_bytealign_S (w5[0], w5[1], offset); w7[3] = amd_bytealign_S (w4[3], w5[0], offset); w7[2] = amd_bytealign_S (w4[2], w4[3], offset); w7[1] = amd_bytealign_S (w4[1], w4[2], offset); w7[0] = amd_bytealign_S (w4[0], w4[1], offset); w6[3] = amd_bytealign_S (w3[3], w4[0], offset); w6[2] = amd_bytealign_S (w3[2], w3[3], offset); w6[1] = amd_bytealign_S (w3[1], w3[2], offset); w6[0] = amd_bytealign_S (w3[0], w3[1], offset); w5[3] = amd_bytealign_S (w2[3], w3[0], offset); w5[2] = amd_bytealign_S (w2[2], w2[3], offset); w5[1] = amd_bytealign_S (w2[1], w2[2], offset); w5[0] = amd_bytealign_S (w2[0], w2[1], offset); w4[3] = amd_bytealign_S (w1[3], w2[0], offset); w4[2] = amd_bytealign_S (w1[2], w1[3], offset); w4[1] = amd_bytealign_S (w1[1], w1[2], offset); w4[0] = amd_bytealign_S (w1[0], w1[1], offset); w3[3] = amd_bytealign_S (w0[3], w1[0], offset); w3[2] = amd_bytealign_S (w0[2], w0[3], offset); w3[1] = amd_bytealign_S (w0[1], w0[2], offset); w3[0] = amd_bytealign_S (w0[0], w0[1], offset); w2[3] = amd_bytealign_S ( 0, w0[0], offset); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = amd_bytealign_S (w7[3], 0, offset); c2[3] = amd_bytealign_S (w7[2], w7[3], offset); c2[2] = amd_bytealign_S (w7[1], w7[2], offset); c2[1] = amd_bytealign_S (w7[0], w7[1], offset); c2[0] = amd_bytealign_S (w6[3], w7[0], offset); c1[3] = amd_bytealign_S (w6[2], w6[3], offset); c1[2] = amd_bytealign_S (w6[1], w6[2], offset); c1[1] = amd_bytealign_S (w6[0], w6[1], offset); c1[0] = amd_bytealign_S (w5[3], w6[0], offset); c0[3] = amd_bytealign_S (w5[2], w5[3], offset); c0[2] 
= amd_bytealign_S (w5[1], w5[2], offset); c0[1] = amd_bytealign_S (w5[0], w5[1], offset); c0[0] = amd_bytealign_S (w4[3], w5[0], offset); w7[3] = amd_bytealign_S (w4[2], w4[3], offset); w7[2] = amd_bytealign_S (w4[1], w4[2], offset); w7[1] = amd_bytealign_S (w4[0], w4[1], offset); w7[0] = amd_bytealign_S (w3[3], w4[0], offset); w6[3] = amd_bytealign_S (w3[2], w3[3], offset); w6[2] = amd_bytealign_S (w3[1], w3[2], offset); w6[1] = amd_bytealign_S (w3[0], w3[1], offset); w6[0] = amd_bytealign_S (w2[3], w3[0], offset); w5[3] = amd_bytealign_S (w2[2], w2[3], offset); w5[2] = amd_bytealign_S (w2[1], w2[2], offset); w5[1] = amd_bytealign_S (w2[0], w2[1], offset); w5[0] = amd_bytealign_S (w1[3], w2[0], offset); w4[3] = amd_bytealign_S (w1[2], w1[3], offset); w4[2] = amd_bytealign_S (w1[1], w1[2], offset); w4[1] = amd_bytealign_S (w1[0], w1[1], offset); w4[0] = amd_bytealign_S (w0[3], w1[0], offset); w3[3] = amd_bytealign_S (w0[2], w0[3], offset); w3[2] = amd_bytealign_S (w0[1], w0[2], offset); w3[1] = amd_bytealign_S (w0[0], w0[1], offset); w3[0] = amd_bytealign_S ( 0, w0[0], offset); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = amd_bytealign_S (w7[3], 0, offset); c3[0] = amd_bytealign_S (w7[2], w7[3], offset); c2[3] = amd_bytealign_S (w7[1], w7[2], offset); c2[2] = amd_bytealign_S (w7[0], w7[1], offset); c2[1] = amd_bytealign_S (w6[3], w7[0], offset); c2[0] = amd_bytealign_S (w6[2], w6[3], offset); c1[3] = amd_bytealign_S (w6[1], w6[2], offset); c1[2] = amd_bytealign_S (w6[0], w6[1], offset); c1[1] = amd_bytealign_S (w5[3], w6[0], offset); c1[0] = amd_bytealign_S (w5[2], w5[3], offset); c0[3] = amd_bytealign_S (w5[1], w5[2], offset); c0[2] = amd_bytealign_S (w5[0], w5[1], offset); c0[1] = amd_bytealign_S (w4[3], w5[0], offset); c0[0] = amd_bytealign_S (w4[2], w4[3], offset); w7[3] = amd_bytealign_S (w4[1], w4[2], offset); w7[2] = amd_bytealign_S (w4[0], w4[1], 
offset); w7[1] = amd_bytealign_S (w3[3], w4[0], offset); w7[0] = amd_bytealign_S (w3[2], w3[3], offset); w6[3] = amd_bytealign_S (w3[1], w3[2], offset); w6[2] = amd_bytealign_S (w3[0], w3[1], offset); w6[1] = amd_bytealign_S (w2[3], w3[0], offset); w6[0] = amd_bytealign_S (w2[2], w2[3], offset); w5[3] = amd_bytealign_S (w2[1], w2[2], offset); w5[2] = amd_bytealign_S (w2[0], w2[1], offset); w5[1] = amd_bytealign_S (w1[3], w2[0], offset); w5[0] = amd_bytealign_S (w1[2], w1[3], offset); w4[3] = amd_bytealign_S (w1[1], w1[2], offset); w4[2] = amd_bytealign_S (w1[0], w1[1], offset); w4[1] = amd_bytealign_S (w0[3], w1[0], offset); w4[0] = amd_bytealign_S (w0[2], w0[3], offset); w3[3] = amd_bytealign_S (w0[1], w0[2], offset); w3[2] = amd_bytealign_S (w0[0], w0[1], offset); w3[1] = amd_bytealign_S ( 0, w0[0], offset); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = amd_bytealign_S (w7[3], 0, offset); c3[1] = amd_bytealign_S (w7[2], w7[3], offset); c3[0] = amd_bytealign_S (w7[1], w7[2], offset); c2[3] = amd_bytealign_S (w7[0], w7[1], offset); c2[2] = amd_bytealign_S (w6[3], w7[0], offset); c2[1] = amd_bytealign_S (w6[2], w6[3], offset); c2[0] = amd_bytealign_S (w6[1], w6[2], offset); c1[3] = amd_bytealign_S (w6[0], w6[1], offset); c1[2] = amd_bytealign_S (w5[3], w6[0], offset); c1[1] = amd_bytealign_S (w5[2], w5[3], offset); c1[0] = amd_bytealign_S (w5[1], w5[2], offset); c0[3] = amd_bytealign_S (w5[0], w5[1], offset); c0[2] = amd_bytealign_S (w4[3], w5[0], offset); c0[1] = amd_bytealign_S (w4[2], w4[3], offset); c0[0] = amd_bytealign_S (w4[1], w4[2], offset); w7[3] = amd_bytealign_S (w4[0], w4[1], offset); w7[2] = amd_bytealign_S (w3[3], w4[0], offset); w7[1] = amd_bytealign_S (w3[2], w3[3], offset); w7[0] = amd_bytealign_S (w3[1], w3[2], offset); w6[3] = amd_bytealign_S (w3[0], w3[1], offset); w6[2] = amd_bytealign_S (w2[3], w3[0], offset); w6[1] = 
amd_bytealign_S (w2[2], w2[3], offset); w6[0] = amd_bytealign_S (w2[1], w2[2], offset); w5[3] = amd_bytealign_S (w2[0], w2[1], offset); w5[2] = amd_bytealign_S (w1[3], w2[0], offset); w5[1] = amd_bytealign_S (w1[2], w1[3], offset); w5[0] = amd_bytealign_S (w1[1], w1[2], offset); w4[3] = amd_bytealign_S (w1[0], w1[1], offset); w4[2] = amd_bytealign_S (w0[3], w1[0], offset); w4[1] = amd_bytealign_S (w0[2], w0[3], offset); w4[0] = amd_bytealign_S (w0[1], w0[2], offset); w3[3] = amd_bytealign_S (w0[0], w0[1], offset); w3[2] = amd_bytealign_S ( 0, w0[0], offset); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = amd_bytealign_S (w7[3], 0, offset); c3[2] = amd_bytealign_S (w7[2], w7[3], offset); c3[1] = amd_bytealign_S (w7[1], w7[2], offset); c3[0] = amd_bytealign_S (w7[0], w7[1], offset); c2[3] = amd_bytealign_S (w6[3], w7[0], offset); c2[2] = amd_bytealign_S (w6[2], w6[3], offset); c2[1] = amd_bytealign_S (w6[1], w6[2], offset); c2[0] = amd_bytealign_S (w6[0], w6[1], offset); c1[3] = amd_bytealign_S (w5[3], w6[0], offset); c1[2] = amd_bytealign_S (w5[2], w5[3], offset); c1[1] = amd_bytealign_S (w5[1], w5[2], offset); c1[0] = amd_bytealign_S (w5[0], w5[1], offset); c0[3] = amd_bytealign_S (w4[3], w5[0], offset); c0[2] = amd_bytealign_S (w4[2], w4[3], offset); c0[1] = amd_bytealign_S (w4[1], w4[2], offset); c0[0] = amd_bytealign_S (w4[0], w4[1], offset); w7[3] = amd_bytealign_S (w3[3], w4[0], offset); w7[2] = amd_bytealign_S (w3[2], w3[3], offset); w7[1] = amd_bytealign_S (w3[1], w3[2], offset); w7[0] = amd_bytealign_S (w3[0], w3[1], offset); w6[3] = amd_bytealign_S (w2[3], w3[0], offset); w6[2] = amd_bytealign_S (w2[2], w2[3], offset); w6[1] = amd_bytealign_S (w2[1], w2[2], offset); w6[0] = amd_bytealign_S (w2[0], w2[1], offset); w5[3] = amd_bytealign_S (w1[3], w2[0], offset); w5[2] = amd_bytealign_S (w1[2], w1[3], offset); w5[1] = 
amd_bytealign_S (w1[1], w1[2], offset); w5[0] = amd_bytealign_S (w1[0], w1[1], offset); w4[3] = amd_bytealign_S (w0[3], w1[0], offset); w4[2] = amd_bytealign_S (w0[2], w0[3], offset); w4[1] = amd_bytealign_S (w0[1], w0[2], offset); w4[0] = amd_bytealign_S (w0[0], w0[1], offset); w3[3] = amd_bytealign_S ( 0, w0[0], offset); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: c4[0] = amd_bytealign_S (w7[3], 0, offset); c3[3] = amd_bytealign_S (w7[2], w7[3], offset); c3[2] = amd_bytealign_S (w7[1], w7[2], offset); c3[1] = amd_bytealign_S (w7[0], w7[1], offset); c3[0] = amd_bytealign_S (w6[3], w7[0], offset); c2[3] = amd_bytealign_S (w6[2], w6[3], offset); c2[2] = amd_bytealign_S (w6[1], w6[2], offset); c2[1] = amd_bytealign_S (w6[0], w6[1], offset); c2[0] = amd_bytealign_S (w5[3], w6[0], offset); c1[3] = amd_bytealign_S (w5[2], w5[3], offset); c1[2] = amd_bytealign_S (w5[1], w5[2], offset); c1[1] = amd_bytealign_S (w5[0], w5[1], offset); c1[0] = amd_bytealign_S (w4[3], w5[0], offset); c0[3] = amd_bytealign_S (w4[2], w4[3], offset); c0[2] = amd_bytealign_S (w4[1], w4[2], offset); c0[1] = amd_bytealign_S (w4[0], w4[1], offset); c0[0] = amd_bytealign_S (w3[3], w4[0], offset); w7[3] = amd_bytealign_S (w3[2], w3[3], offset); w7[2] = amd_bytealign_S (w3[1], w3[2], offset); w7[1] = amd_bytealign_S (w3[0], w3[1], offset); w7[0] = amd_bytealign_S (w2[3], w3[0], offset); w6[3] = amd_bytealign_S (w2[2], w2[3], offset); w6[2] = amd_bytealign_S (w2[1], w2[2], offset); w6[1] = amd_bytealign_S (w2[0], w2[1], offset); w6[0] = amd_bytealign_S (w1[3], w2[0], offset); w5[3] = amd_bytealign_S (w1[2], w1[3], offset); w5[2] = amd_bytealign_S (w1[1], w1[2], offset); w5[1] = amd_bytealign_S (w1[0], w1[1], offset); w5[0] = amd_bytealign_S (w0[3], w1[0], offset); w4[3] = amd_bytealign_S (w0[2], w0[3], offset); w4[2] = amd_bytealign_S (w0[1], w0[2], offset); w4[1] = 
amd_bytealign_S (w0[0], w0[1], offset); w4[0] = amd_bytealign_S ( 0, w0[0], offset); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: c4[1] = amd_bytealign_S (w7[3], 0, offset); c4[0] = amd_bytealign_S (w7[2], w7[3], offset); c3[3] = amd_bytealign_S (w7[1], w7[2], offset); c3[2] = amd_bytealign_S (w7[0], w7[1], offset); c3[1] = amd_bytealign_S (w6[3], w7[0], offset); c3[0] = amd_bytealign_S (w6[2], w6[3], offset); c2[3] = amd_bytealign_S (w6[1], w6[2], offset); c2[2] = amd_bytealign_S (w6[0], w6[1], offset); c2[1] = amd_bytealign_S (w5[3], w6[0], offset); c2[0] = amd_bytealign_S (w5[2], w5[3], offset); c1[3] = amd_bytealign_S (w5[1], w5[2], offset); c1[2] = amd_bytealign_S (w5[0], w5[1], offset); c1[1] = amd_bytealign_S (w4[3], w5[0], offset); c1[0] = amd_bytealign_S (w4[2], w4[3], offset); c0[3] = amd_bytealign_S (w4[1], w4[2], offset); c0[2] = amd_bytealign_S (w4[0], w4[1], offset); c0[1] = amd_bytealign_S (w3[3], w4[0], offset); c0[0] = amd_bytealign_S (w3[2], w3[3], offset); w7[3] = amd_bytealign_S (w3[1], w3[2], offset); w7[2] = amd_bytealign_S (w3[0], w3[1], offset); w7[1] = amd_bytealign_S (w2[3], w3[0], offset); w7[0] = amd_bytealign_S (w2[2], w2[3], offset); w6[3] = amd_bytealign_S (w2[1], w2[2], offset); w6[2] = amd_bytealign_S (w2[0], w2[1], offset); w6[1] = amd_bytealign_S (w1[3], w2[0], offset); w6[0] = amd_bytealign_S (w1[2], w1[3], offset); w5[3] = amd_bytealign_S (w1[1], w1[2], offset); w5[2] = amd_bytealign_S (w1[0], w1[1], offset); w5[1] = amd_bytealign_S (w0[3], w1[0], offset); w5[0] = amd_bytealign_S (w0[2], w0[3], offset); w4[3] = amd_bytealign_S (w0[1], w0[2], offset); w4[2] = amd_bytealign_S (w0[0], w0[1], offset); w4[1] = amd_bytealign_S ( 0, w0[0], offset); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; 
w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: c4[2] = amd_bytealign_S (w7[3], 0, offset); c4[1] = amd_bytealign_S (w7[2], w7[3], offset); c4[0] = amd_bytealign_S (w7[1], w7[2], offset); c3[3] = amd_bytealign_S (w7[0], w7[1], offset); c3[2] = amd_bytealign_S (w6[3], w7[0], offset); c3[1] = amd_bytealign_S (w6[2], w6[3], offset); c3[0] = amd_bytealign_S (w6[1], w6[2], offset); c2[3] = amd_bytealign_S (w6[0], w6[1], offset); c2[2] = amd_bytealign_S (w5[3], w6[0], offset); c2[1] = amd_bytealign_S (w5[2], w5[3], offset); c2[0] = amd_bytealign_S (w5[1], w5[2], offset); c1[3] = amd_bytealign_S (w5[0], w5[1], offset); c1[2] = amd_bytealign_S (w4[3], w5[0], offset); c1[1] = amd_bytealign_S (w4[2], w4[3], offset); c1[0] = amd_bytealign_S (w4[1], w4[2], offset); c0[3] = amd_bytealign_S (w4[0], w4[1], offset); c0[2] = amd_bytealign_S (w3[3], w4[0], offset); c0[1] = amd_bytealign_S (w3[2], w3[3], offset); c0[0] = amd_bytealign_S (w3[1], w3[2], offset); w7[3] = amd_bytealign_S (w3[0], w3[1], offset); w7[2] = amd_bytealign_S (w2[3], w3[0], offset); w7[1] = amd_bytealign_S (w2[2], w2[3], offset); w7[0] = amd_bytealign_S (w2[1], w2[2], offset); w6[3] = amd_bytealign_S (w2[0], w2[1], offset); w6[2] = amd_bytealign_S (w1[3], w2[0], offset); w6[1] = amd_bytealign_S (w1[2], w1[3], offset); w6[0] = amd_bytealign_S (w1[1], w1[2], offset); w5[3] = amd_bytealign_S (w1[0], w1[1], offset); w5[2] = amd_bytealign_S (w0[3], w1[0], offset); w5[1] = amd_bytealign_S (w0[2], w0[3], offset); w5[0] = amd_bytealign_S (w0[1], w0[2], offset); w4[3] = amd_bytealign_S (w0[0], w0[1], offset); w4[2] = amd_bytealign_S ( 0, w0[0], offset); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: c4[3] = amd_bytealign_S (w7[3], 0, offset); c4[2] = amd_bytealign_S (w7[2], w7[3], offset); c4[1] = amd_bytealign_S (w7[1], w7[2], offset); c4[0] = 
amd_bytealign_S (w7[0], w7[1], offset); c3[3] = amd_bytealign_S (w6[3], w7[0], offset); c3[2] = amd_bytealign_S (w6[2], w6[3], offset); c3[1] = amd_bytealign_S (w6[1], w6[2], offset); c3[0] = amd_bytealign_S (w6[0], w6[1], offset); c2[3] = amd_bytealign_S (w5[3], w6[0], offset); c2[2] = amd_bytealign_S (w5[2], w5[3], offset); c2[1] = amd_bytealign_S (w5[1], w5[2], offset); c2[0] = amd_bytealign_S (w5[0], w5[1], offset); c1[3] = amd_bytealign_S (w4[3], w5[0], offset); c1[2] = amd_bytealign_S (w4[2], w4[3], offset); c1[1] = amd_bytealign_S (w4[1], w4[2], offset); c1[0] = amd_bytealign_S (w4[0], w4[1], offset); c0[3] = amd_bytealign_S (w3[3], w4[0], offset); c0[2] = amd_bytealign_S (w3[2], w3[3], offset); c0[1] = amd_bytealign_S (w3[1], w3[2], offset); c0[0] = amd_bytealign_S (w3[0], w3[1], offset); w7[3] = amd_bytealign_S (w2[3], w3[0], offset); w7[2] = amd_bytealign_S (w2[2], w2[3], offset); w7[1] = amd_bytealign_S (w2[1], w2[2], offset); w7[0] = amd_bytealign_S (w2[0], w2[1], offset); w6[3] = amd_bytealign_S (w1[3], w2[0], offset); w6[2] = amd_bytealign_S (w1[2], w1[3], offset); w6[1] = amd_bytealign_S (w1[1], w1[2], offset); w6[0] = amd_bytealign_S (w1[0], w1[1], offset); w5[3] = amd_bytealign_S (w0[3], w1[0], offset); w5[2] = amd_bytealign_S (w0[2], w0[3], offset); w5[1] = amd_bytealign_S (w0[1], w0[2], offset); w5[0] = amd_bytealign_S (w0[0], w0[1], offset); w4[3] = amd_bytealign_S ( 0, w0[0], offset); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: c5[0] = amd_bytealign_S (w7[3], 0, offset); c4[3] = amd_bytealign_S (w7[2], w7[3], offset); c4[2] = amd_bytealign_S (w7[1], w7[2], offset); c4[1] = amd_bytealign_S (w7[0], w7[1], offset); c4[0] = amd_bytealign_S (w6[3], w7[0], offset); c3[3] = amd_bytealign_S (w6[2], w6[3], offset); c3[2] = amd_bytealign_S (w6[1], w6[2], offset); c3[1] = 
amd_bytealign_S (w6[0], w6[1], offset); c3[0] = amd_bytealign_S (w5[3], w6[0], offset); c2[3] = amd_bytealign_S (w5[2], w5[3], offset); c2[2] = amd_bytealign_S (w5[1], w5[2], offset); c2[1] = amd_bytealign_S (w5[0], w5[1], offset); c2[0] = amd_bytealign_S (w4[3], w5[0], offset); c1[3] = amd_bytealign_S (w4[2], w4[3], offset); c1[2] = amd_bytealign_S (w4[1], w4[2], offset); c1[1] = amd_bytealign_S (w4[0], w4[1], offset); c1[0] = amd_bytealign_S (w3[3], w4[0], offset); c0[3] = amd_bytealign_S (w3[2], w3[3], offset); c0[2] = amd_bytealign_S (w3[1], w3[2], offset); c0[1] = amd_bytealign_S (w3[0], w3[1], offset); c0[0] = amd_bytealign_S (w2[3], w3[0], offset); w7[3] = amd_bytealign_S (w2[2], w2[3], offset); w7[2] = amd_bytealign_S (w2[1], w2[2], offset); w7[1] = amd_bytealign_S (w2[0], w2[1], offset); w7[0] = amd_bytealign_S (w1[3], w2[0], offset); w6[3] = amd_bytealign_S (w1[2], w1[3], offset); w6[2] = amd_bytealign_S (w1[1], w1[2], offset); w6[1] = amd_bytealign_S (w1[0], w1[1], offset); w6[0] = amd_bytealign_S (w0[3], w1[0], offset); w5[3] = amd_bytealign_S (w0[2], w0[3], offset); w5[2] = amd_bytealign_S (w0[1], w0[2], offset); w5[1] = amd_bytealign_S (w0[0], w0[1], offset); w5[0] = amd_bytealign_S ( 0, w0[0], offset); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: c5[1] = amd_bytealign_S (w7[3], 0, offset); c5[0] = amd_bytealign_S (w7[2], w7[3], offset); c4[3] = amd_bytealign_S (w7[1], w7[2], offset); c4[2] = amd_bytealign_S (w7[0], w7[1], offset); c4[1] = amd_bytealign_S (w6[3], w7[0], offset); c4[0] = amd_bytealign_S (w6[2], w6[3], offset); c3[3] = amd_bytealign_S (w6[1], w6[2], offset); c3[2] = amd_bytealign_S (w6[0], w6[1], offset); c3[1] = amd_bytealign_S (w5[3], w6[0], offset); c3[0] = amd_bytealign_S (w5[2], w5[3], offset); c2[3] = amd_bytealign_S (w5[1], w5[2], offset); 
c2[2] = amd_bytealign_S (w5[0], w5[1], offset); c2[1] = amd_bytealign_S (w4[3], w5[0], offset); c2[0] = amd_bytealign_S (w4[2], w4[3], offset); c1[3] = amd_bytealign_S (w4[1], w4[2], offset); c1[2] = amd_bytealign_S (w4[0], w4[1], offset); c1[1] = amd_bytealign_S (w3[3], w4[0], offset); c1[0] = amd_bytealign_S (w3[2], w3[3], offset); c0[3] = amd_bytealign_S (w3[1], w3[2], offset); c0[2] = amd_bytealign_S (w3[0], w3[1], offset); c0[1] = amd_bytealign_S (w2[3], w3[0], offset); c0[0] = amd_bytealign_S (w2[2], w2[3], offset); w7[3] = amd_bytealign_S (w2[1], w2[2], offset); w7[2] = amd_bytealign_S (w2[0], w2[1], offset); w7[1] = amd_bytealign_S (w1[3], w2[0], offset); w7[0] = amd_bytealign_S (w1[2], w1[3], offset); w6[3] = amd_bytealign_S (w1[1], w1[2], offset); w6[2] = amd_bytealign_S (w1[0], w1[1], offset); w6[1] = amd_bytealign_S (w0[3], w1[0], offset); w6[0] = amd_bytealign_S (w0[2], w0[3], offset); w5[3] = amd_bytealign_S (w0[1], w0[2], offset); w5[2] = amd_bytealign_S (w0[0], w0[1], offset); w5[1] = amd_bytealign_S ( 0, w0[0], offset); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: c5[2] = amd_bytealign_S (w7[3], 0, offset); c5[1] = amd_bytealign_S (w7[2], w7[3], offset); c5[0] = amd_bytealign_S (w7[1], w7[2], offset); c4[3] = amd_bytealign_S (w7[0], w7[1], offset); c4[2] = amd_bytealign_S (w6[3], w7[0], offset); c4[1] = amd_bytealign_S (w6[2], w6[3], offset); c4[0] = amd_bytealign_S (w6[1], w6[2], offset); c3[3] = amd_bytealign_S (w6[0], w6[1], offset); c3[2] = amd_bytealign_S (w5[3], w6[0], offset); c3[1] = amd_bytealign_S (w5[2], w5[3], offset); c3[0] = amd_bytealign_S (w5[1], w5[2], offset); c2[3] = amd_bytealign_S (w5[0], w5[1], offset); c2[2] = amd_bytealign_S (w4[3], w5[0], offset); c2[1] = amd_bytealign_S (w4[2], w4[3], offset); c2[0] = amd_bytealign_S (w4[1], 
w4[2], offset); c1[3] = amd_bytealign_S (w4[0], w4[1], offset); c1[2] = amd_bytealign_S (w3[3], w4[0], offset); c1[1] = amd_bytealign_S (w3[2], w3[3], offset); c1[0] = amd_bytealign_S (w3[1], w3[2], offset); c0[3] = amd_bytealign_S (w3[0], w3[1], offset); c0[2] = amd_bytealign_S (w2[3], w3[0], offset); c0[1] = amd_bytealign_S (w2[2], w2[3], offset); c0[0] = amd_bytealign_S (w2[1], w2[2], offset); w7[3] = amd_bytealign_S (w2[0], w2[1], offset); w7[2] = amd_bytealign_S (w1[3], w2[0], offset); w7[1] = amd_bytealign_S (w1[2], w1[3], offset); w7[0] = amd_bytealign_S (w1[1], w1[2], offset); w6[3] = amd_bytealign_S (w1[0], w1[1], offset); w6[2] = amd_bytealign_S (w0[3], w1[0], offset); w6[1] = amd_bytealign_S (w0[2], w0[3], offset); w6[0] = amd_bytealign_S (w0[1], w0[2], offset); w5[3] = amd_bytealign_S (w0[0], w0[1], offset); w5[2] = amd_bytealign_S ( 0, w0[0], offset); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: c5[3] = amd_bytealign_S (w7[3], 0, offset); c5[2] = amd_bytealign_S (w7[2], w7[3], offset); c5[1] = amd_bytealign_S (w7[1], w7[2], offset); c5[0] = amd_bytealign_S (w7[0], w7[1], offset); c4[3] = amd_bytealign_S (w6[3], w7[0], offset); c4[2] = amd_bytealign_S (w6[2], w6[3], offset); c4[1] = amd_bytealign_S (w6[1], w6[2], offset); c4[0] = amd_bytealign_S (w6[0], w6[1], offset); c3[3] = amd_bytealign_S (w5[3], w6[0], offset); c3[2] = amd_bytealign_S (w5[2], w5[3], offset); c3[1] = amd_bytealign_S (w5[1], w5[2], offset); c3[0] = amd_bytealign_S (w5[0], w5[1], offset); c2[3] = amd_bytealign_S (w4[3], w5[0], offset); c2[2] = amd_bytealign_S (w4[2], w4[3], offset); c2[1] = amd_bytealign_S (w4[1], w4[2], offset); c2[0] = amd_bytealign_S (w4[0], w4[1], offset); c1[3] = amd_bytealign_S (w3[3], w4[0], offset); c1[2] = amd_bytealign_S (w3[2], w3[3], offset); 
c1[1] = amd_bytealign_S (w3[1], w3[2], offset); c1[0] = amd_bytealign_S (w3[0], w3[1], offset); c0[3] = amd_bytealign_S (w2[3], w3[0], offset); c0[2] = amd_bytealign_S (w2[2], w2[3], offset); c0[1] = amd_bytealign_S (w2[1], w2[2], offset); c0[0] = amd_bytealign_S (w2[0], w2[1], offset); w7[3] = amd_bytealign_S (w1[3], w2[0], offset); w7[2] = amd_bytealign_S (w1[2], w1[3], offset); w7[1] = amd_bytealign_S (w1[1], w1[2], offset); w7[0] = amd_bytealign_S (w1[0], w1[1], offset); w6[3] = amd_bytealign_S (w0[3], w1[0], offset); w6[2] = amd_bytealign_S (w0[2], w0[3], offset); w6[1] = amd_bytealign_S (w0[1], w0[2], offset); w6[0] = amd_bytealign_S (w0[0], w0[1], offset); w5[3] = amd_bytealign_S ( 0, w0[0], offset); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: c6[0] = amd_bytealign_S (w7[3], 0, offset); c5[3] = amd_bytealign_S (w7[2], w7[3], offset); c5[2] = amd_bytealign_S (w7[1], w7[2], offset); c5[1] = amd_bytealign_S (w7[0], w7[1], offset); c5[0] = amd_bytealign_S (w6[3], w7[0], offset); c4[3] = amd_bytealign_S (w6[2], w6[3], offset); c4[2] = amd_bytealign_S (w6[1], w6[2], offset); c4[1] = amd_bytealign_S (w6[0], w6[1], offset); c4[0] = amd_bytealign_S (w5[3], w6[0], offset); c3[3] = amd_bytealign_S (w5[2], w5[3], offset); c3[2] = amd_bytealign_S (w5[1], w5[2], offset); c3[1] = amd_bytealign_S (w5[0], w5[1], offset); c3[0] = amd_bytealign_S (w4[3], w5[0], offset); c2[3] = amd_bytealign_S (w4[2], w4[3], offset); c2[2] = amd_bytealign_S (w4[1], w4[2], offset); c2[1] = amd_bytealign_S (w4[0], w4[1], offset); c2[0] = amd_bytealign_S (w3[3], w4[0], offset); c1[3] = amd_bytealign_S (w3[2], w3[3], offset); c1[2] = amd_bytealign_S (w3[1], w3[2], offset); c1[1] = amd_bytealign_S (w3[0], w3[1], offset); c1[0] = amd_bytealign_S (w2[3], w3[0], offset); c0[3] = 
amd_bytealign_S (w2[2], w2[3], offset); c0[2] = amd_bytealign_S (w2[1], w2[2], offset); c0[1] = amd_bytealign_S (w2[0], w2[1], offset); c0[0] = amd_bytealign_S (w1[3], w2[0], offset); w7[3] = amd_bytealign_S (w1[2], w1[3], offset); w7[2] = amd_bytealign_S (w1[1], w1[2], offset); w7[1] = amd_bytealign_S (w1[0], w1[1], offset); w7[0] = amd_bytealign_S (w0[3], w1[0], offset); w6[3] = amd_bytealign_S (w0[2], w0[3], offset); w6[2] = amd_bytealign_S (w0[1], w0[2], offset); w6[1] = amd_bytealign_S (w0[0], w0[1], offset); w6[0] = amd_bytealign_S ( 0, w0[0], offset); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: c6[1] = amd_bytealign_S (w7[3], 0, offset); c6[0] = amd_bytealign_S (w7[2], w7[3], offset); c5[3] = amd_bytealign_S (w7[1], w7[2], offset); c5[2] = amd_bytealign_S (w7[0], w7[1], offset); c5[1] = amd_bytealign_S (w6[3], w7[0], offset); c5[0] = amd_bytealign_S (w6[2], w6[3], offset); c4[3] = amd_bytealign_S (w6[1], w6[2], offset); c4[2] = amd_bytealign_S (w6[0], w6[1], offset); c4[1] = amd_bytealign_S (w5[3], w6[0], offset); c4[0] = amd_bytealign_S (w5[2], w5[3], offset); c3[3] = amd_bytealign_S (w5[1], w5[2], offset); c3[2] = amd_bytealign_S (w5[0], w5[1], offset); c3[1] = amd_bytealign_S (w4[3], w5[0], offset); c3[0] = amd_bytealign_S (w4[2], w4[3], offset); c2[3] = amd_bytealign_S (w4[1], w4[2], offset); c2[2] = amd_bytealign_S (w4[0], w4[1], offset); c2[1] = amd_bytealign_S (w3[3], w4[0], offset); c2[0] = amd_bytealign_S (w3[2], w3[3], offset); c1[3] = amd_bytealign_S (w3[1], w3[2], offset); c1[2] = amd_bytealign_S (w3[0], w3[1], offset); c1[1] = amd_bytealign_S (w2[3], w3[0], offset); c1[0] = amd_bytealign_S (w2[2], w2[3], offset); c0[3] = amd_bytealign_S (w2[1], w2[2], offset); c0[2] = amd_bytealign_S (w2[0], w2[1], offset); c0[1] 
= amd_bytealign_S (w1[3], w2[0], offset); c0[0] = amd_bytealign_S (w1[2], w1[3], offset); w7[3] = amd_bytealign_S (w1[1], w1[2], offset); w7[2] = amd_bytealign_S (w1[0], w1[1], offset); w7[1] = amd_bytealign_S (w0[3], w1[0], offset); w7[0] = amd_bytealign_S (w0[2], w0[3], offset); w6[3] = amd_bytealign_S (w0[1], w0[2], offset); w6[2] = amd_bytealign_S (w0[0], w0[1], offset); w6[1] = amd_bytealign_S ( 0, w0[0], offset); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: c6[2] = amd_bytealign_S (w7[3], 0, offset); c6[1] = amd_bytealign_S (w7[2], w7[3], offset); c6[0] = amd_bytealign_S (w7[1], w7[2], offset); c5[3] = amd_bytealign_S (w7[0], w7[1], offset); c5[2] = amd_bytealign_S (w6[3], w7[0], offset); c5[1] = amd_bytealign_S (w6[2], w6[3], offset); c5[0] = amd_bytealign_S (w6[1], w6[2], offset); c4[3] = amd_bytealign_S (w6[0], w6[1], offset); c4[2] = amd_bytealign_S (w5[3], w6[0], offset); c4[1] = amd_bytealign_S (w5[2], w5[3], offset); c4[0] = amd_bytealign_S (w5[1], w5[2], offset); c3[3] = amd_bytealign_S (w5[0], w5[1], offset); c3[2] = amd_bytealign_S (w4[3], w5[0], offset); c3[1] = amd_bytealign_S (w4[2], w4[3], offset); c3[0] = amd_bytealign_S (w4[1], w4[2], offset); c2[3] = amd_bytealign_S (w4[0], w4[1], offset); c2[2] = amd_bytealign_S (w3[3], w4[0], offset); c2[1] = amd_bytealign_S (w3[2], w3[3], offset); c2[0] = amd_bytealign_S (w3[1], w3[2], offset); c1[3] = amd_bytealign_S (w3[0], w3[1], offset); c1[2] = amd_bytealign_S (w2[3], w3[0], offset); c1[1] = amd_bytealign_S (w2[2], w2[3], offset); c1[0] = amd_bytealign_S (w2[1], w2[2], offset); c0[3] = amd_bytealign_S (w2[0], w2[1], offset); c0[2] = amd_bytealign_S (w1[3], w2[0], offset); c0[1] = amd_bytealign_S (w1[2], w1[3], offset); c0[0] = amd_bytealign_S (w1[1], w1[2], 
offset); w7[3] = amd_bytealign_S (w1[0], w1[1], offset); w7[2] = amd_bytealign_S (w0[3], w1[0], offset); w7[1] = amd_bytealign_S (w0[2], w0[3], offset); w7[0] = amd_bytealign_S (w0[1], w0[2], offset); w6[3] = amd_bytealign_S (w0[0], w0[1], offset); w6[2] = amd_bytealign_S ( 0, w0[0], offset); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: c6[3] = amd_bytealign_S (w7[3], 0, offset); c6[2] = amd_bytealign_S (w7[2], w7[3], offset); c6[1] = amd_bytealign_S (w7[1], w7[2], offset); c6[0] = amd_bytealign_S (w7[0], w7[1], offset); c5[3] = amd_bytealign_S (w6[3], w7[0], offset); c5[2] = amd_bytealign_S (w6[2], w6[3], offset); c5[1] = amd_bytealign_S (w6[1], w6[2], offset); c5[0] = amd_bytealign_S (w6[0], w6[1], offset); c4[3] = amd_bytealign_S (w5[3], w6[0], offset); c4[2] = amd_bytealign_S (w5[2], w5[3], offset); c4[1] = amd_bytealign_S (w5[1], w5[2], offset); c4[0] = amd_bytealign_S (w5[0], w5[1], offset); c3[3] = amd_bytealign_S (w4[3], w5[0], offset); c3[2] = amd_bytealign_S (w4[2], w4[3], offset); c3[1] = amd_bytealign_S (w4[1], w4[2], offset); c3[0] = amd_bytealign_S (w4[0], w4[1], offset); c2[3] = amd_bytealign_S (w3[3], w4[0], offset); c2[2] = amd_bytealign_S (w3[2], w3[3], offset); c2[1] = amd_bytealign_S (w3[1], w3[2], offset); c2[0] = amd_bytealign_S (w3[0], w3[1], offset); c1[3] = amd_bytealign_S (w2[3], w3[0], offset); c1[2] = amd_bytealign_S (w2[2], w2[3], offset); c1[1] = amd_bytealign_S (w2[1], w2[2], offset); c1[0] = amd_bytealign_S (w2[0], w2[1], offset); c0[3] = amd_bytealign_S (w1[3], w2[0], offset); c0[2] = amd_bytealign_S (w1[2], w1[3], offset); c0[1] = amd_bytealign_S (w1[1], w1[2], offset); c0[0] = amd_bytealign_S (w1[0], w1[1], offset); w7[3] = amd_bytealign_S (w0[3], w1[0], offset); w7[2] = 
amd_bytealign_S (w0[2], w0[3], offset); w7[1] = amd_bytealign_S (w0[1], w0[2], offset); w7[0] = amd_bytealign_S (w0[0], w0[1], offset); w6[3] = amd_bytealign_S ( 0, w0[0], offset); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: c7[0] = amd_bytealign_S (w7[3], 0, offset); c6[3] = amd_bytealign_S (w7[2], w7[3], offset); c6[2] = amd_bytealign_S (w7[1], w7[2], offset); c6[1] = amd_bytealign_S (w7[0], w7[1], offset); c6[0] = amd_bytealign_S (w6[3], w7[0], offset); c5[3] = amd_bytealign_S (w6[2], w6[3], offset); c5[2] = amd_bytealign_S (w6[1], w6[2], offset); c5[1] = amd_bytealign_S (w6[0], w6[1], offset); c5[0] = amd_bytealign_S (w5[3], w6[0], offset); c4[3] = amd_bytealign_S (w5[2], w5[3], offset); c4[2] = amd_bytealign_S (w5[1], w5[2], offset); c4[1] = amd_bytealign_S (w5[0], w5[1], offset); c4[0] = amd_bytealign_S (w4[3], w5[0], offset); c3[3] = amd_bytealign_S (w4[2], w4[3], offset); c3[2] = amd_bytealign_S (w4[1], w4[2], offset); c3[1] = amd_bytealign_S (w4[0], w4[1], offset); c3[0] = amd_bytealign_S (w3[3], w4[0], offset); c2[3] = amd_bytealign_S (w3[2], w3[3], offset); c2[2] = amd_bytealign_S (w3[1], w3[2], offset); c2[1] = amd_bytealign_S (w3[0], w3[1], offset); c2[0] = amd_bytealign_S (w2[3], w3[0], offset); c1[3] = amd_bytealign_S (w2[2], w2[3], offset); c1[2] = amd_bytealign_S (w2[1], w2[2], offset); c1[1] = amd_bytealign_S (w2[0], w2[1], offset); c1[0] = amd_bytealign_S (w1[3], w2[0], offset); c0[3] = amd_bytealign_S (w1[2], w1[3], offset); c0[2] = amd_bytealign_S (w1[1], w1[2], offset); c0[1] = amd_bytealign_S (w1[0], w1[1], offset); c0[0] = amd_bytealign_S (w0[3], w1[0], offset); w7[3] = amd_bytealign_S (w0[2], w0[3], offset); w7[2] = amd_bytealign_S (w0[1], w0[2], offset); w7[1] = 
amd_bytealign_S (w0[0], w0[1], offset); w7[0] = amd_bytealign_S ( 0, w0[0], offset); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: c7[1] = amd_bytealign_S (w7[3], 0, offset); c7[0] = amd_bytealign_S (w7[2], w7[3], offset); c6[3] = amd_bytealign_S (w7[1], w7[2], offset); c6[2] = amd_bytealign_S (w7[0], w7[1], offset); c6[1] = amd_bytealign_S (w6[3], w7[0], offset); c6[0] = amd_bytealign_S (w6[2], w6[3], offset); c5[3] = amd_bytealign_S (w6[1], w6[2], offset); c5[2] = amd_bytealign_S (w6[0], w6[1], offset); c5[1] = amd_bytealign_S (w5[3], w6[0], offset); c5[0] = amd_bytealign_S (w5[2], w5[3], offset); c4[3] = amd_bytealign_S (w5[1], w5[2], offset); c4[2] = amd_bytealign_S (w5[0], w5[1], offset); c4[1] = amd_bytealign_S (w4[3], w5[0], offset); c4[0] = amd_bytealign_S (w4[2], w4[3], offset); c3[3] = amd_bytealign_S (w4[1], w4[2], offset); c3[2] = amd_bytealign_S (w4[0], w4[1], offset); c3[1] = amd_bytealign_S (w3[3], w4[0], offset); c3[0] = amd_bytealign_S (w3[2], w3[3], offset); c2[3] = amd_bytealign_S (w3[1], w3[2], offset); c2[2] = amd_bytealign_S (w3[0], w3[1], offset); c2[1] = amd_bytealign_S (w2[3], w3[0], offset); c2[0] = amd_bytealign_S (w2[2], w2[3], offset); c1[3] = amd_bytealign_S (w2[1], w2[2], offset); c1[2] = amd_bytealign_S (w2[0], w2[1], offset); c1[1] = amd_bytealign_S (w1[3], w2[0], offset); c1[0] = amd_bytealign_S (w1[2], w1[3], offset); c0[3] = amd_bytealign_S (w1[1], w1[2], offset); c0[2] = amd_bytealign_S (w1[0], w1[1], offset); c0[1] = amd_bytealign_S (w0[3], w1[0], offset); c0[0] = amd_bytealign_S (w0[2], w0[3], offset); w7[3] = amd_bytealign_S (w0[1], w0[2], offset); w7[2] = amd_bytealign_S (w0[0], w0[1], offset); w7[1] = amd_bytealign_S ( 0, w0[0], offset); w7[0] = 0; 
w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: c7[2] = amd_bytealign_S (w7[3], 0, offset); c7[1] = amd_bytealign_S (w7[2], w7[3], offset); c7[0] = amd_bytealign_S (w7[1], w7[2], offset); c6[3] = amd_bytealign_S (w7[0], w7[1], offset); c6[2] = amd_bytealign_S (w6[3], w7[0], offset); c6[1] = amd_bytealign_S (w6[2], w6[3], offset); c6[0] = amd_bytealign_S (w6[1], w6[2], offset); c5[3] = amd_bytealign_S (w6[0], w6[1], offset); c5[2] = amd_bytealign_S (w5[3], w6[0], offset); c5[1] = amd_bytealign_S (w5[2], w5[3], offset); c5[0] = amd_bytealign_S (w5[1], w5[2], offset); c4[3] = amd_bytealign_S (w5[0], w5[1], offset); c4[2] = amd_bytealign_S (w4[3], w5[0], offset); c4[1] = amd_bytealign_S (w4[2], w4[3], offset); c4[0] = amd_bytealign_S (w4[1], w4[2], offset); c3[3] = amd_bytealign_S (w4[0], w4[1], offset); c3[2] = amd_bytealign_S (w3[3], w4[0], offset); c3[1] = amd_bytealign_S (w3[2], w3[3], offset); c3[0] = amd_bytealign_S (w3[1], w3[2], offset); c2[3] = amd_bytealign_S (w3[0], w3[1], offset); c2[2] = amd_bytealign_S (w2[3], w3[0], offset); c2[1] = amd_bytealign_S (w2[2], w2[3], offset); c2[0] = amd_bytealign_S (w2[1], w2[2], offset); c1[3] = amd_bytealign_S (w2[0], w2[1], offset); c1[2] = amd_bytealign_S (w1[3], w2[0], offset); c1[1] = amd_bytealign_S (w1[2], w1[3], offset); c1[0] = amd_bytealign_S (w1[1], w1[2], offset); c0[3] = amd_bytealign_S (w1[0], w1[1], offset); c0[2] = amd_bytealign_S (w0[3], w1[0], offset); c0[1] = amd_bytealign_S (w0[2], w0[3], offset); c0[0] = amd_bytealign_S (w0[1], w0[2], offset); w7[3] = amd_bytealign_S (w0[0], w0[1], offset); w7[2] = amd_bytealign_S ( 0, w0[0], offset); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; 
w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: c7[3] = amd_bytealign_S (w7[3], 0, offset); c7[2] = amd_bytealign_S (w7[2], w7[3], offset); c7[1] = amd_bytealign_S (w7[1], w7[2], offset); c7[0] = amd_bytealign_S (w7[0], w7[1], offset); c6[3] = amd_bytealign_S (w6[3], w7[0], offset); c6[2] = amd_bytealign_S (w6[2], w6[3], offset); c6[1] = amd_bytealign_S (w6[1], w6[2], offset); c6[0] = amd_bytealign_S (w6[0], w6[1], offset); c5[3] = amd_bytealign_S (w5[3], w6[0], offset); c5[2] = amd_bytealign_S (w5[2], w5[3], offset); c5[1] = amd_bytealign_S (w5[1], w5[2], offset); c5[0] = amd_bytealign_S (w5[0], w5[1], offset); c4[3] = amd_bytealign_S (w4[3], w5[0], offset); c4[2] = amd_bytealign_S (w4[2], w4[3], offset); c4[1] = amd_bytealign_S (w4[1], w4[2], offset); c4[0] = amd_bytealign_S (w4[0], w4[1], offset); c3[3] = amd_bytealign_S (w3[3], w4[0], offset); c3[2] = amd_bytealign_S (w3[2], w3[3], offset); c3[1] = amd_bytealign_S (w3[1], w3[2], offset); c3[0] = amd_bytealign_S (w3[0], w3[1], offset); c2[3] = amd_bytealign_S (w2[3], w3[0], offset); c2[2] = amd_bytealign_S (w2[2], w2[3], offset); c2[1] = amd_bytealign_S (w2[1], w2[2], offset); c2[0] = amd_bytealign_S (w2[0], w2[1], offset); c1[3] = amd_bytealign_S (w1[3], w2[0], offset); c1[2] = amd_bytealign_S (w1[2], w1[3], offset); c1[1] = amd_bytealign_S (w1[1], w1[2], offset); c1[0] = amd_bytealign_S (w1[0], w1[1], offset); c0[3] = amd_bytealign_S (w0[3], w1[0], offset); c0[2] = amd_bytealign_S (w0[2], w0[3], offset); c0[1] = amd_bytealign_S (w0[1], w0[2], offset); c0[0] = amd_bytealign_S (w0[0], w0[1], offset); w7[3] = amd_bytealign_S ( 0, w0[0], offset); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; 
w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: c0[0] = __byte_perm_S ( 0, w7[3], selector); w7[3] = __byte_perm_S (w7[3], w7[2], selector); w7[2] = __byte_perm_S (w7[2], w7[1], selector); w7[1] = __byte_perm_S (w7[1], w7[0], selector); w7[0] = __byte_perm_S (w7[0], w6[3], selector); w6[3] = __byte_perm_S (w6[3], w6[2], selector); w6[2] = __byte_perm_S (w6[2], w6[1], selector); w6[1] = __byte_perm_S (w6[1], w6[0], selector); w6[0] = __byte_perm_S (w6[0], w5[3], selector); w5[3] = __byte_perm_S (w5[3], w5[2], selector); w5[2] = __byte_perm_S (w5[2], w5[1], selector); w5[1] = __byte_perm_S (w5[1], w5[0], selector); w5[0] = __byte_perm_S (w5[0], w4[3], selector); w4[3] = __byte_perm_S (w4[3], w4[2], selector); w4[2] = __byte_perm_S (w4[2], w4[1], selector); w4[1] = __byte_perm_S (w4[1], w4[0], selector); w4[0] = __byte_perm_S (w4[0], w3[3], selector); w3[3] = __byte_perm_S (w3[3], w3[2], selector); w3[2] = __byte_perm_S (w3[2], w3[1], selector); w3[1] = __byte_perm_S (w3[1], w3[0], selector); w3[0] = __byte_perm_S (w3[0], w2[3], selector); w2[3] = __byte_perm_S (w2[3], w2[2], selector); w2[2] = __byte_perm_S (w2[2], w2[1], selector); w2[1] = __byte_perm_S (w2[1], w2[0], selector); w2[0] = __byte_perm_S (w2[0], w1[3], selector); w1[3] = __byte_perm_S (w1[3], w1[2], selector); w1[2] = __byte_perm_S (w1[2], w1[1], selector); w1[1] = __byte_perm_S (w1[1], w1[0], selector); w1[0] = __byte_perm_S (w1[0], w0[3], selector); w0[3] = __byte_perm_S (w0[3], w0[2], selector); w0[2] = __byte_perm_S (w0[2], w0[1], selector); w0[1] = __byte_perm_S (w0[1], w0[0], selector); w0[0] = 
__byte_perm_S (w0[0], 0, selector); break; case 1: c0[1] = __byte_perm_S ( 0, w7[3], selector); c0[0] = __byte_perm_S (w7[3], w7[2], selector); w7[3] = __byte_perm_S (w7[2], w7[1], selector); w7[2] = __byte_perm_S (w7[1], w7[0], selector); w7[1] = __byte_perm_S (w7[0], w6[3], selector); w7[0] = __byte_perm_S (w6[3], w6[2], selector); w6[3] = __byte_perm_S (w6[2], w6[1], selector); w6[2] = __byte_perm_S (w6[1], w6[0], selector); w6[1] = __byte_perm_S (w6[0], w5[3], selector); w6[0] = __byte_perm_S (w5[3], w5[2], selector); w5[3] = __byte_perm_S (w5[2], w5[1], selector); w5[2] = __byte_perm_S (w5[1], w5[0], selector); w5[1] = __byte_perm_S (w5[0], w4[3], selector); w5[0] = __byte_perm_S (w4[3], w4[2], selector); w4[3] = __byte_perm_S (w4[2], w4[1], selector); w4[2] = __byte_perm_S (w4[1], w4[0], selector); w4[1] = __byte_perm_S (w4[0], w3[3], selector); w4[0] = __byte_perm_S (w3[3], w3[2], selector); w3[3] = __byte_perm_S (w3[2], w3[1], selector); w3[2] = __byte_perm_S (w3[1], w3[0], selector); w3[1] = __byte_perm_S (w3[0], w2[3], selector); w3[0] = __byte_perm_S (w2[3], w2[2], selector); w2[3] = __byte_perm_S (w2[2], w2[1], selector); w2[2] = __byte_perm_S (w2[1], w2[0], selector); w2[1] = __byte_perm_S (w2[0], w1[3], selector); w2[0] = __byte_perm_S (w1[3], w1[2], selector); w1[3] = __byte_perm_S (w1[2], w1[1], selector); w1[2] = __byte_perm_S (w1[1], w1[0], selector); w1[1] = __byte_perm_S (w1[0], w0[3], selector); w1[0] = __byte_perm_S (w0[3], w0[2], selector); w0[3] = __byte_perm_S (w0[2], w0[1], selector); w0[2] = __byte_perm_S (w0[1], w0[0], selector); w0[1] = __byte_perm_S (w0[0], 0, selector); w0[0] = 0; break; case 2: c0[2] = __byte_perm_S ( 0, w7[3], selector); c0[1] = __byte_perm_S (w7[3], w7[2], selector); c0[0] = __byte_perm_S (w7[2], w7[1], selector); w7[3] = __byte_perm_S (w7[1], w7[0], selector); w7[2] = __byte_perm_S (w7[0], w6[3], selector); w7[1] = __byte_perm_S (w6[3], w6[2], selector); w7[0] = __byte_perm_S (w6[2], w6[1], selector); w6[3] = 
__byte_perm_S (w6[1], w6[0], selector); w6[2] = __byte_perm_S (w6[0], w5[3], selector); w6[1] = __byte_perm_S (w5[3], w5[2], selector); w6[0] = __byte_perm_S (w5[2], w5[1], selector); w5[3] = __byte_perm_S (w5[1], w5[0], selector); w5[2] = __byte_perm_S (w5[0], w4[3], selector); w5[1] = __byte_perm_S (w4[3], w4[2], selector); w5[0] = __byte_perm_S (w4[2], w4[1], selector); w4[3] = __byte_perm_S (w4[1], w4[0], selector); w4[2] = __byte_perm_S (w4[0], w3[3], selector); w4[1] = __byte_perm_S (w3[3], w3[2], selector); w4[0] = __byte_perm_S (w3[2], w3[1], selector); w3[3] = __byte_perm_S (w3[1], w3[0], selector); w3[2] = __byte_perm_S (w3[0], w2[3], selector); w3[1] = __byte_perm_S (w2[3], w2[2], selector); w3[0] = __byte_perm_S (w2[2], w2[1], selector); w2[3] = __byte_perm_S (w2[1], w2[0], selector); w2[2] = __byte_perm_S (w2[0], w1[3], selector); w2[1] = __byte_perm_S (w1[3], w1[2], selector); w2[0] = __byte_perm_S (w1[2], w1[1], selector); w1[3] = __byte_perm_S (w1[1], w1[0], selector); w1[2] = __byte_perm_S (w1[0], w0[3], selector); w1[1] = __byte_perm_S (w0[3], w0[2], selector); w1[0] = __byte_perm_S (w0[2], w0[1], selector); w0[3] = __byte_perm_S (w0[1], w0[0], selector); w0[2] = __byte_perm_S (w0[0], 0, selector); w0[1] = 0; w0[0] = 0; break; case 3: c0[3] = __byte_perm_S ( 0, w7[3], selector); c0[2] = __byte_perm_S (w7[3], w7[2], selector); c0[1] = __byte_perm_S (w7[2], w7[1], selector); c0[0] = __byte_perm_S (w7[1], w7[0], selector); w7[3] = __byte_perm_S (w7[0], w6[3], selector); w7[2] = __byte_perm_S (w6[3], w6[2], selector); w7[1] = __byte_perm_S (w6[2], w6[1], selector); w7[0] = __byte_perm_S (w6[1], w6[0], selector); w6[3] = __byte_perm_S (w6[0], w5[3], selector); w6[2] = __byte_perm_S (w5[3], w5[2], selector); w6[1] = __byte_perm_S (w5[2], w5[1], selector); w6[0] = __byte_perm_S (w5[1], w5[0], selector); w5[3] = __byte_perm_S (w5[0], w4[3], selector); w5[2] = __byte_perm_S (w4[3], w4[2], selector); w5[1] = __byte_perm_S (w4[2], w4[1], selector); w5[0] = 
__byte_perm_S (w4[1], w4[0], selector); w4[3] = __byte_perm_S (w4[0], w3[3], selector); w4[2] = __byte_perm_S (w3[3], w3[2], selector); w4[1] = __byte_perm_S (w3[2], w3[1], selector); w4[0] = __byte_perm_S (w3[1], w3[0], selector); w3[3] = __byte_perm_S (w3[0], w2[3], selector); w3[2] = __byte_perm_S (w2[3], w2[2], selector); w3[1] = __byte_perm_S (w2[2], w2[1], selector); w3[0] = __byte_perm_S (w2[1], w2[0], selector); w2[3] = __byte_perm_S (w2[0], w1[3], selector); w2[2] = __byte_perm_S (w1[3], w1[2], selector); w2[1] = __byte_perm_S (w1[2], w1[1], selector); w2[0] = __byte_perm_S (w1[1], w1[0], selector); w1[3] = __byte_perm_S (w1[0], w0[3], selector); w1[2] = __byte_perm_S (w0[3], w0[2], selector); w1[1] = __byte_perm_S (w0[2], w0[1], selector); w1[0] = __byte_perm_S (w0[1], w0[0], selector); w0[3] = __byte_perm_S (w0[0], 0, selector); w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 4: c1[0] = __byte_perm_S ( 0, w7[3], selector); c0[3] = __byte_perm_S (w7[3], w7[2], selector); c0[2] = __byte_perm_S (w7[2], w7[1], selector); c0[1] = __byte_perm_S (w7[1], w7[0], selector); c0[0] = __byte_perm_S (w7[0], w6[3], selector); w7[3] = __byte_perm_S (w6[3], w6[2], selector); w7[2] = __byte_perm_S (w6[2], w6[1], selector); w7[1] = __byte_perm_S (w6[1], w6[0], selector); w7[0] = __byte_perm_S (w6[0], w5[3], selector); w6[3] = __byte_perm_S (w5[3], w5[2], selector); w6[2] = __byte_perm_S (w5[2], w5[1], selector); w6[1] = __byte_perm_S (w5[1], w5[0], selector); w6[0] = __byte_perm_S (w5[0], w4[3], selector); w5[3] = __byte_perm_S (w4[3], w4[2], selector); w5[2] = __byte_perm_S (w4[2], w4[1], selector); w5[1] = __byte_perm_S (w4[1], w4[0], selector); w5[0] = __byte_perm_S (w4[0], w3[3], selector); w4[3] = __byte_perm_S (w3[3], w3[2], selector); w4[2] = __byte_perm_S (w3[2], w3[1], selector); w4[1] = __byte_perm_S (w3[1], w3[0], selector); w4[0] = __byte_perm_S (w3[0], w2[3], selector); w3[3] = __byte_perm_S (w2[3], w2[2], selector); w3[2] = __byte_perm_S (w2[2], w2[1], 
selector); w3[1] = __byte_perm_S (w2[1], w2[0], selector); w3[0] = __byte_perm_S (w2[0], w1[3], selector); w2[3] = __byte_perm_S (w1[3], w1[2], selector); w2[2] = __byte_perm_S (w1[2], w1[1], selector); w2[1] = __byte_perm_S (w1[1], w1[0], selector); w2[0] = __byte_perm_S (w1[0], w0[3], selector); w1[3] = __byte_perm_S (w0[3], w0[2], selector); w1[2] = __byte_perm_S (w0[2], w0[1], selector); w1[1] = __byte_perm_S (w0[1], w0[0], selector); w1[0] = __byte_perm_S (w0[0], 0, selector); w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 5: c1[1] = __byte_perm_S ( 0, w7[3], selector); c1[0] = __byte_perm_S (w7[3], w7[2], selector); c0[3] = __byte_perm_S (w7[2], w7[1], selector); c0[2] = __byte_perm_S (w7[1], w7[0], selector); c0[1] = __byte_perm_S (w7[0], w6[3], selector); c0[0] = __byte_perm_S (w6[3], w6[2], selector); w7[3] = __byte_perm_S (w6[2], w6[1], selector); w7[2] = __byte_perm_S (w6[1], w6[0], selector); w7[1] = __byte_perm_S (w6[0], w5[3], selector); w7[0] = __byte_perm_S (w5[3], w5[2], selector); w6[3] = __byte_perm_S (w5[2], w5[1], selector); w6[2] = __byte_perm_S (w5[1], w5[0], selector); w6[1] = __byte_perm_S (w5[0], w4[3], selector); w6[0] = __byte_perm_S (w4[3], w4[2], selector); w5[3] = __byte_perm_S (w4[2], w4[1], selector); w5[2] = __byte_perm_S (w4[1], w4[0], selector); w5[1] = __byte_perm_S (w4[0], w3[3], selector); w5[0] = __byte_perm_S (w3[3], w3[2], selector); w4[3] = __byte_perm_S (w3[2], w3[1], selector); w4[2] = __byte_perm_S (w3[1], w3[0], selector); w4[1] = __byte_perm_S (w3[0], w2[3], selector); w4[0] = __byte_perm_S (w2[3], w2[2], selector); w3[3] = __byte_perm_S (w2[2], w2[1], selector); w3[2] = __byte_perm_S (w2[1], w2[0], selector); w3[1] = __byte_perm_S (w2[0], w1[3], selector); w3[0] = __byte_perm_S (w1[3], w1[2], selector); w2[3] = __byte_perm_S (w1[2], w1[1], selector); w2[2] = __byte_perm_S (w1[1], w1[0], selector); w2[1] = __byte_perm_S (w1[0], w0[3], selector); w2[0] = __byte_perm_S (w0[3], w0[2], selector); w1[3] = 
__byte_perm_S (w0[2], w0[1], selector); w1[2] = __byte_perm_S (w0[1], w0[0], selector); w1[1] = __byte_perm_S (w0[0], 0, selector); w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 6: c1[2] = __byte_perm_S ( 0, w7[3], selector); c1[1] = __byte_perm_S (w7[3], w7[2], selector); c1[0] = __byte_perm_S (w7[2], w7[1], selector); c0[3] = __byte_perm_S (w7[1], w7[0], selector); c0[2] = __byte_perm_S (w7[0], w6[3], selector); c0[1] = __byte_perm_S (w6[3], w6[2], selector); c0[0] = __byte_perm_S (w6[2], w6[1], selector); w7[3] = __byte_perm_S (w6[1], w6[0], selector); w7[2] = __byte_perm_S (w6[0], w5[3], selector); w7[1] = __byte_perm_S (w5[3], w5[2], selector); w7[0] = __byte_perm_S (w5[2], w5[1], selector); w6[3] = __byte_perm_S (w5[1], w5[0], selector); w6[2] = __byte_perm_S (w5[0], w4[3], selector); w6[1] = __byte_perm_S (w4[3], w4[2], selector); w6[0] = __byte_perm_S (w4[2], w4[1], selector); w5[3] = __byte_perm_S (w4[1], w4[0], selector); w5[2] = __byte_perm_S (w4[0], w3[3], selector); w5[1] = __byte_perm_S (w3[3], w3[2], selector); w5[0] = __byte_perm_S (w3[2], w3[1], selector); w4[3] = __byte_perm_S (w3[1], w3[0], selector); w4[2] = __byte_perm_S (w3[0], w2[3], selector); w4[1] = __byte_perm_S (w2[3], w2[2], selector); w4[0] = __byte_perm_S (w2[2], w2[1], selector); w3[3] = __byte_perm_S (w2[1], w2[0], selector); w3[2] = __byte_perm_S (w2[0], w1[3], selector); w3[1] = __byte_perm_S (w1[3], w1[2], selector); w3[0] = __byte_perm_S (w1[2], w1[1], selector); w2[3] = __byte_perm_S (w1[1], w1[0], selector); w2[2] = __byte_perm_S (w1[0], w0[3], selector); w2[1] = __byte_perm_S (w0[3], w0[2], selector); w2[0] = __byte_perm_S (w0[2], w0[1], selector); w1[3] = __byte_perm_S (w0[1], w0[0], selector); w1[2] = __byte_perm_S (w0[0], 0, selector); w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 7: c1[3] = __byte_perm_S ( 0, w7[3], selector); c1[2] = __byte_perm_S (w7[3], w7[2], selector); c1[1] = __byte_perm_S (w7[2], w7[1], 
selector); c1[0] = __byte_perm_S (w7[1], w7[0], selector); c0[3] = __byte_perm_S (w7[0], w6[3], selector); c0[2] = __byte_perm_S (w6[3], w6[2], selector); c0[1] = __byte_perm_S (w6[2], w6[1], selector); c0[0] = __byte_perm_S (w6[1], w6[0], selector); w7[3] = __byte_perm_S (w6[0], w5[3], selector); w7[2] = __byte_perm_S (w5[3], w5[2], selector); w7[1] = __byte_perm_S (w5[2], w5[1], selector); w7[0] = __byte_perm_S (w5[1], w5[0], selector); w6[3] = __byte_perm_S (w5[0], w4[3], selector); w6[2] = __byte_perm_S (w4[3], w4[2], selector); w6[1] = __byte_perm_S (w4[2], w4[1], selector); w6[0] = __byte_perm_S (w4[1], w4[0], selector); w5[3] = __byte_perm_S (w4[0], w3[3], selector); w5[2] = __byte_perm_S (w3[3], w3[2], selector); w5[1] = __byte_perm_S (w3[2], w3[1], selector); w5[0] = __byte_perm_S (w3[1], w3[0], selector); w4[3] = __byte_perm_S (w3[0], w2[3], selector); w4[2] = __byte_perm_S (w2[3], w2[2], selector); w4[1] = __byte_perm_S (w2[2], w2[1], selector); w4[0] = __byte_perm_S (w2[1], w2[0], selector); w3[3] = __byte_perm_S (w2[0], w1[3], selector); w3[2] = __byte_perm_S (w1[3], w1[2], selector); w3[1] = __byte_perm_S (w1[2], w1[1], selector); w3[0] = __byte_perm_S (w1[1], w1[0], selector); w2[3] = __byte_perm_S (w1[0], w0[3], selector); w2[2] = __byte_perm_S (w0[3], w0[2], selector); w2[1] = __byte_perm_S (w0[2], w0[1], selector); w2[0] = __byte_perm_S (w0[1], w0[0], selector); w1[3] = __byte_perm_S (w0[0], 0, selector); w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 8: c2[0] = __byte_perm_S ( 0, w7[3], selector); c1[3] = __byte_perm_S (w7[3], w7[2], selector); c1[2] = __byte_perm_S (w7[2], w7[1], selector); c1[1] = __byte_perm_S (w7[1], w7[0], selector); c1[0] = __byte_perm_S (w7[0], w6[3], selector); c0[3] = __byte_perm_S (w6[3], w6[2], selector); c0[2] = __byte_perm_S (w6[2], w6[1], selector); c0[1] = __byte_perm_S (w6[1], w6[0], selector); c0[0] = __byte_perm_S (w6[0], w5[3], selector); w7[3] = __byte_perm_S (w5[3], 
w5[2], selector); w7[2] = __byte_perm_S (w5[2], w5[1], selector); w7[1] = __byte_perm_S (w5[1], w5[0], selector); w7[0] = __byte_perm_S (w5[0], w4[3], selector); w6[3] = __byte_perm_S (w4[3], w4[2], selector); w6[2] = __byte_perm_S (w4[2], w4[1], selector); w6[1] = __byte_perm_S (w4[1], w4[0], selector); w6[0] = __byte_perm_S (w4[0], w3[3], selector); w5[3] = __byte_perm_S (w3[3], w3[2], selector); w5[2] = __byte_perm_S (w3[2], w3[1], selector); w5[1] = __byte_perm_S (w3[1], w3[0], selector); w5[0] = __byte_perm_S (w3[0], w2[3], selector); w4[3] = __byte_perm_S (w2[3], w2[2], selector); w4[2] = __byte_perm_S (w2[2], w2[1], selector); w4[1] = __byte_perm_S (w2[1], w2[0], selector); w4[0] = __byte_perm_S (w2[0], w1[3], selector); w3[3] = __byte_perm_S (w1[3], w1[2], selector); w3[2] = __byte_perm_S (w1[2], w1[1], selector); w3[1] = __byte_perm_S (w1[1], w1[0], selector); w3[0] = __byte_perm_S (w1[0], w0[3], selector); w2[3] = __byte_perm_S (w0[3], w0[2], selector); w2[2] = __byte_perm_S (w0[2], w0[1], selector); w2[1] = __byte_perm_S (w0[1], w0[0], selector); w2[0] = __byte_perm_S (w0[0], 0, selector); w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 9: c2[1] = __byte_perm_S ( 0, w7[3], selector); c2[0] = __byte_perm_S (w7[3], w7[2], selector); c1[3] = __byte_perm_S (w7[2], w7[1], selector); c1[2] = __byte_perm_S (w7[1], w7[0], selector); c1[1] = __byte_perm_S (w7[0], w6[3], selector); c1[0] = __byte_perm_S (w6[3], w6[2], selector); c0[3] = __byte_perm_S (w6[2], w6[1], selector); c0[2] = __byte_perm_S (w6[1], w6[0], selector); c0[1] = __byte_perm_S (w6[0], w5[3], selector); c0[0] = __byte_perm_S (w5[3], w5[2], selector); w7[3] = __byte_perm_S (w5[2], w5[1], selector); w7[2] = __byte_perm_S (w5[1], w5[0], selector); w7[1] = __byte_perm_S (w5[0], w4[3], selector); w7[0] = __byte_perm_S (w4[3], w4[2], selector); w6[3] = __byte_perm_S (w4[2], w4[1], selector); w6[2] = __byte_perm_S (w4[1], w4[0], selector); w6[1] = 
__byte_perm_S (w4[0], w3[3], selector); w6[0] = __byte_perm_S (w3[3], w3[2], selector); w5[3] = __byte_perm_S (w3[2], w3[1], selector); w5[2] = __byte_perm_S (w3[1], w3[0], selector); w5[1] = __byte_perm_S (w3[0], w2[3], selector); w5[0] = __byte_perm_S (w2[3], w2[2], selector); w4[3] = __byte_perm_S (w2[2], w2[1], selector); w4[2] = __byte_perm_S (w2[1], w2[0], selector); w4[1] = __byte_perm_S (w2[0], w1[3], selector); w4[0] = __byte_perm_S (w1[3], w1[2], selector); w3[3] = __byte_perm_S (w1[2], w1[1], selector); w3[2] = __byte_perm_S (w1[1], w1[0], selector); w3[1] = __byte_perm_S (w1[0], w0[3], selector); w3[0] = __byte_perm_S (w0[3], w0[2], selector); w2[3] = __byte_perm_S (w0[2], w0[1], selector); w2[2] = __byte_perm_S (w0[1], w0[0], selector); w2[1] = __byte_perm_S (w0[0], 0, selector); w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 10: c2[2] = __byte_perm_S ( 0, w7[3], selector); c2[1] = __byte_perm_S (w7[3], w7[2], selector); c2[0] = __byte_perm_S (w7[2], w7[1], selector); c1[3] = __byte_perm_S (w7[1], w7[0], selector); c1[2] = __byte_perm_S (w7[0], w6[3], selector); c1[1] = __byte_perm_S (w6[3], w6[2], selector); c1[0] = __byte_perm_S (w6[2], w6[1], selector); c0[3] = __byte_perm_S (w6[1], w6[0], selector); c0[2] = __byte_perm_S (w6[0], w5[3], selector); c0[1] = __byte_perm_S (w5[3], w5[2], selector); c0[0] = __byte_perm_S (w5[2], w5[1], selector); w7[3] = __byte_perm_S (w5[1], w5[0], selector); w7[2] = __byte_perm_S (w5[0], w4[3], selector); w7[1] = __byte_perm_S (w4[3], w4[2], selector); w7[0] = __byte_perm_S (w4[2], w4[1], selector); w6[3] = __byte_perm_S (w4[1], w4[0], selector); w6[2] = __byte_perm_S (w4[0], w3[3], selector); w6[1] = __byte_perm_S (w3[3], w3[2], selector); w6[0] = __byte_perm_S (w3[2], w3[1], selector); w5[3] = __byte_perm_S (w3[1], w3[0], selector); w5[2] = __byte_perm_S (w3[0], w2[3], selector); w5[1] = __byte_perm_S (w2[3], w2[2], selector); w5[0] = __byte_perm_S 
(w2[2], w2[1], selector); w4[3] = __byte_perm_S (w2[1], w2[0], selector); w4[2] = __byte_perm_S (w2[0], w1[3], selector); w4[1] = __byte_perm_S (w1[3], w1[2], selector); w4[0] = __byte_perm_S (w1[2], w1[1], selector); w3[3] = __byte_perm_S (w1[1], w1[0], selector); w3[2] = __byte_perm_S (w1[0], w0[3], selector); w3[1] = __byte_perm_S (w0[3], w0[2], selector); w3[0] = __byte_perm_S (w0[2], w0[1], selector); w2[3] = __byte_perm_S (w0[1], w0[0], selector); w2[2] = __byte_perm_S (w0[0], 0, selector); w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 11: c2[3] = __byte_perm_S ( 0, w7[3], selector); c2[2] = __byte_perm_S (w7[3], w7[2], selector); c2[1] = __byte_perm_S (w7[2], w7[1], selector); c2[0] = __byte_perm_S (w7[1], w7[0], selector); c1[3] = __byte_perm_S (w7[0], w6[3], selector); c1[2] = __byte_perm_S (w6[3], w6[2], selector); c1[1] = __byte_perm_S (w6[2], w6[1], selector); c1[0] = __byte_perm_S (w6[1], w6[0], selector); c0[3] = __byte_perm_S (w6[0], w5[3], selector); c0[2] = __byte_perm_S (w5[3], w5[2], selector); c0[1] = __byte_perm_S (w5[2], w5[1], selector); c0[0] = __byte_perm_S (w5[1], w5[0], selector); w7[3] = __byte_perm_S (w5[0], w4[3], selector); w7[2] = __byte_perm_S (w4[3], w4[2], selector); w7[1] = __byte_perm_S (w4[2], w4[1], selector); w7[0] = __byte_perm_S (w4[1], w4[0], selector); w6[3] = __byte_perm_S (w4[0], w3[3], selector); w6[2] = __byte_perm_S (w3[3], w3[2], selector); w6[1] = __byte_perm_S (w3[2], w3[1], selector); w6[0] = __byte_perm_S (w3[1], w3[0], selector); w5[3] = __byte_perm_S (w3[0], w2[3], selector); w5[2] = __byte_perm_S (w2[3], w2[2], selector); w5[1] = __byte_perm_S (w2[2], w2[1], selector); w5[0] = __byte_perm_S (w2[1], w2[0], selector); w4[3] = __byte_perm_S (w2[0], w1[3], selector); w4[2] = __byte_perm_S (w1[3], w1[2], selector); w4[1] = __byte_perm_S (w1[2], w1[1], selector); w4[0] = __byte_perm_S (w1[1], w1[0], selector); w3[3] = __byte_perm_S (w1[0], 
w0[3], selector); w3[2] = __byte_perm_S (w0[3], w0[2], selector); w3[1] = __byte_perm_S (w0[2], w0[1], selector); w3[0] = __byte_perm_S (w0[1], w0[0], selector); w2[3] = __byte_perm_S (w0[0], 0, selector); w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 12: c3[0] = __byte_perm_S ( 0, w7[3], selector); c2[3] = __byte_perm_S (w7[3], w7[2], selector); c2[2] = __byte_perm_S (w7[2], w7[1], selector); c2[1] = __byte_perm_S (w7[1], w7[0], selector); c2[0] = __byte_perm_S (w7[0], w6[3], selector); c1[3] = __byte_perm_S (w6[3], w6[2], selector); c1[2] = __byte_perm_S (w6[2], w6[1], selector); c1[1] = __byte_perm_S (w6[1], w6[0], selector); c1[0] = __byte_perm_S (w6[0], w5[3], selector); c0[3] = __byte_perm_S (w5[3], w5[2], selector); c0[2] = __byte_perm_S (w5[2], w5[1], selector); c0[1] = __byte_perm_S (w5[1], w5[0], selector); c0[0] = __byte_perm_S (w5[0], w4[3], selector); w7[3] = __byte_perm_S (w4[3], w4[2], selector); w7[2] = __byte_perm_S (w4[2], w4[1], selector); w7[1] = __byte_perm_S (w4[1], w4[0], selector); w7[0] = __byte_perm_S (w4[0], w3[3], selector); w6[3] = __byte_perm_S (w3[3], w3[2], selector); w6[2] = __byte_perm_S (w3[2], w3[1], selector); w6[1] = __byte_perm_S (w3[1], w3[0], selector); w6[0] = __byte_perm_S (w3[0], w2[3], selector); w5[3] = __byte_perm_S (w2[3], w2[2], selector); w5[2] = __byte_perm_S (w2[2], w2[1], selector); w5[1] = __byte_perm_S (w2[1], w2[0], selector); w5[0] = __byte_perm_S (w2[0], w1[3], selector); w4[3] = __byte_perm_S (w1[3], w1[2], selector); w4[2] = __byte_perm_S (w1[2], w1[1], selector); w4[1] = __byte_perm_S (w1[1], w1[0], selector); w4[0] = __byte_perm_S (w1[0], w0[3], selector); w3[3] = __byte_perm_S (w0[3], w0[2], selector); w3[2] = __byte_perm_S (w0[2], w0[1], selector); w3[1] = __byte_perm_S (w0[1], w0[0], selector); w3[0] = __byte_perm_S (w0[0], 0, selector); w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; 
w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 13: c3[1] = __byte_perm_S ( 0, w7[3], selector); c3[0] = __byte_perm_S (w7[3], w7[2], selector); c2[3] = __byte_perm_S (w7[2], w7[1], selector); c2[2] = __byte_perm_S (w7[1], w7[0], selector); c2[1] = __byte_perm_S (w7[0], w6[3], selector); c2[0] = __byte_perm_S (w6[3], w6[2], selector); c1[3] = __byte_perm_S (w6[2], w6[1], selector); c1[2] = __byte_perm_S (w6[1], w6[0], selector); c1[1] = __byte_perm_S (w6[0], w5[3], selector); c1[0] = __byte_perm_S (w5[3], w5[2], selector); c0[3] = __byte_perm_S (w5[2], w5[1], selector); c0[2] = __byte_perm_S (w5[1], w5[0], selector); c0[1] = __byte_perm_S (w5[0], w4[3], selector); c0[0] = __byte_perm_S (w4[3], w4[2], selector); w7[3] = __byte_perm_S (w4[2], w4[1], selector); w7[2] = __byte_perm_S (w4[1], w4[0], selector); w7[1] = __byte_perm_S (w4[0], w3[3], selector); w7[0] = __byte_perm_S (w3[3], w3[2], selector); w6[3] = __byte_perm_S (w3[2], w3[1], selector); w6[2] = __byte_perm_S (w3[1], w3[0], selector); w6[1] = __byte_perm_S (w3[0], w2[3], selector); w6[0] = __byte_perm_S (w2[3], w2[2], selector); w5[3] = __byte_perm_S (w2[2], w2[1], selector); w5[2] = __byte_perm_S (w2[1], w2[0], selector); w5[1] = __byte_perm_S (w2[0], w1[3], selector); w5[0] = __byte_perm_S (w1[3], w1[2], selector); w4[3] = __byte_perm_S (w1[2], w1[1], selector); w4[2] = __byte_perm_S (w1[1], w1[0], selector); w4[1] = __byte_perm_S (w1[0], w0[3], selector); w4[0] = __byte_perm_S (w0[3], w0[2], selector); w3[3] = __byte_perm_S (w0[2], w0[1], selector); w3[2] = __byte_perm_S (w0[1], w0[0], selector); w3[1] = __byte_perm_S (w0[0], 0, selector); w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 14: c3[2] = __byte_perm_S ( 0, w7[3], selector); c3[1] = __byte_perm_S (w7[3], w7[2], selector); c3[0] = __byte_perm_S (w7[2], w7[1], selector); c2[3] = __byte_perm_S (w7[1], w7[0], selector); 
c2[2] = __byte_perm_S (w7[0], w6[3], selector); c2[1] = __byte_perm_S (w6[3], w6[2], selector); c2[0] = __byte_perm_S (w6[2], w6[1], selector); c1[3] = __byte_perm_S (w6[1], w6[0], selector); c1[2] = __byte_perm_S (w6[0], w5[3], selector); c1[1] = __byte_perm_S (w5[3], w5[2], selector); c1[0] = __byte_perm_S (w5[2], w5[1], selector); c0[3] = __byte_perm_S (w5[1], w5[0], selector); c0[2] = __byte_perm_S (w5[0], w4[3], selector); c0[1] = __byte_perm_S (w4[3], w4[2], selector); c0[0] = __byte_perm_S (w4[2], w4[1], selector); w7[3] = __byte_perm_S (w4[1], w4[0], selector); w7[2] = __byte_perm_S (w4[0], w3[3], selector); w7[1] = __byte_perm_S (w3[3], w3[2], selector); w7[0] = __byte_perm_S (w3[2], w3[1], selector); w6[3] = __byte_perm_S (w3[1], w3[0], selector); w6[2] = __byte_perm_S (w3[0], w2[3], selector); w6[1] = __byte_perm_S (w2[3], w2[2], selector); w6[0] = __byte_perm_S (w2[2], w2[1], selector); w5[3] = __byte_perm_S (w2[1], w2[0], selector); w5[2] = __byte_perm_S (w2[0], w1[3], selector); w5[1] = __byte_perm_S (w1[3], w1[2], selector); w5[0] = __byte_perm_S (w1[2], w1[1], selector); w4[3] = __byte_perm_S (w1[1], w1[0], selector); w4[2] = __byte_perm_S (w1[0], w0[3], selector); w4[1] = __byte_perm_S (w0[3], w0[2], selector); w4[0] = __byte_perm_S (w0[2], w0[1], selector); w3[3] = __byte_perm_S (w0[1], w0[0], selector); w3[2] = __byte_perm_S (w0[0], 0, selector); w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 15: c3[3] = __byte_perm_S ( 0, w7[3], selector); c3[2] = __byte_perm_S (w7[3], w7[2], selector); c3[1] = __byte_perm_S (w7[2], w7[1], selector); c3[0] = __byte_perm_S (w7[1], w7[0], selector); c2[3] = __byte_perm_S (w7[0], w6[3], selector); c2[2] = __byte_perm_S (w6[3], w6[2], selector); c2[1] = __byte_perm_S (w6[2], w6[1], selector); c2[0] = __byte_perm_S (w6[1], w6[0], selector); c1[3] = __byte_perm_S (w6[0], w5[3], selector); c1[2] = 
__byte_perm_S (w5[3], w5[2], selector); c1[1] = __byte_perm_S (w5[2], w5[1], selector); c1[0] = __byte_perm_S (w5[1], w5[0], selector); c0[3] = __byte_perm_S (w5[0], w4[3], selector); c0[2] = __byte_perm_S (w4[3], w4[2], selector); c0[1] = __byte_perm_S (w4[2], w4[1], selector); c0[0] = __byte_perm_S (w4[1], w4[0], selector); w7[3] = __byte_perm_S (w4[0], w3[3], selector); w7[2] = __byte_perm_S (w3[3], w3[2], selector); w7[1] = __byte_perm_S (w3[2], w3[1], selector); w7[0] = __byte_perm_S (w3[1], w3[0], selector); w6[3] = __byte_perm_S (w3[0], w2[3], selector); w6[2] = __byte_perm_S (w2[3], w2[2], selector); w6[1] = __byte_perm_S (w2[2], w2[1], selector); w6[0] = __byte_perm_S (w2[1], w2[0], selector); w5[3] = __byte_perm_S (w2[0], w1[3], selector); w5[2] = __byte_perm_S (w1[3], w1[2], selector); w5[1] = __byte_perm_S (w1[2], w1[1], selector); w5[0] = __byte_perm_S (w1[1], w1[0], selector); w4[3] = __byte_perm_S (w1[0], w0[3], selector); w4[2] = __byte_perm_S (w0[3], w0[2], selector); w4[1] = __byte_perm_S (w0[2], w0[1], selector); w4[0] = __byte_perm_S (w0[1], w0[0], selector); w3[3] = __byte_perm_S (w0[0], 0, selector); w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 16: c4[0] = __byte_perm_S ( 0, w7[3], selector); c3[3] = __byte_perm_S (w7[3], w7[2], selector); c3[2] = __byte_perm_S (w7[2], w7[1], selector); c3[1] = __byte_perm_S (w7[1], w7[0], selector); c3[0] = __byte_perm_S (w7[0], w6[3], selector); c2[3] = __byte_perm_S (w6[3], w6[2], selector); c2[2] = __byte_perm_S (w6[2], w6[1], selector); c2[1] = __byte_perm_S (w6[1], w6[0], selector); c2[0] = __byte_perm_S (w6[0], w5[3], selector); c1[3] = __byte_perm_S (w5[3], w5[2], selector); c1[2] = __byte_perm_S (w5[2], w5[1], selector); c1[1] = __byte_perm_S (w5[1], w5[0], selector); c1[0] = __byte_perm_S (w5[0], w4[3], selector); c0[3] = __byte_perm_S (w4[3], w4[2], selector); c0[2] = 
__byte_perm_S (w4[2], w4[1], selector); c0[1] = __byte_perm_S (w4[1], w4[0], selector); c0[0] = __byte_perm_S (w4[0], w3[3], selector); w7[3] = __byte_perm_S (w3[3], w3[2], selector); w7[2] = __byte_perm_S (w3[2], w3[1], selector); w7[1] = __byte_perm_S (w3[1], w3[0], selector); w7[0] = __byte_perm_S (w3[0], w2[3], selector); w6[3] = __byte_perm_S (w2[3], w2[2], selector); w6[2] = __byte_perm_S (w2[2], w2[1], selector); w6[1] = __byte_perm_S (w2[1], w2[0], selector); w6[0] = __byte_perm_S (w2[0], w1[3], selector); w5[3] = __byte_perm_S (w1[3], w1[2], selector); w5[2] = __byte_perm_S (w1[2], w1[1], selector); w5[1] = __byte_perm_S (w1[1], w1[0], selector); w5[0] = __byte_perm_S (w1[0], w0[3], selector); w4[3] = __byte_perm_S (w0[3], w0[2], selector); w4[2] = __byte_perm_S (w0[2], w0[1], selector); w4[1] = __byte_perm_S (w0[1], w0[0], selector); w4[0] = __byte_perm_S (w0[0], 0, selector); w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 17: c4[1] = __byte_perm_S ( 0, w7[3], selector); c4[0] = __byte_perm_S (w7[3], w7[2], selector); c3[3] = __byte_perm_S (w7[2], w7[1], selector); c3[2] = __byte_perm_S (w7[1], w7[0], selector); c3[1] = __byte_perm_S (w7[0], w6[3], selector); c3[0] = __byte_perm_S (w6[3], w6[2], selector); c2[3] = __byte_perm_S (w6[2], w6[1], selector); c2[2] = __byte_perm_S (w6[1], w6[0], selector); c2[1] = __byte_perm_S (w6[0], w5[3], selector); c2[0] = __byte_perm_S (w5[3], w5[2], selector); c1[3] = __byte_perm_S (w5[2], w5[1], selector); c1[2] = __byte_perm_S (w5[1], w5[0], selector); c1[1] = __byte_perm_S (w5[0], w4[3], selector); c1[0] = __byte_perm_S (w4[3], w4[2], selector); c0[3] = __byte_perm_S (w4[2], w4[1], selector); c0[2] = __byte_perm_S (w4[1], w4[0], selector); c0[1] = __byte_perm_S (w4[0], w3[3], selector); c0[0] = __byte_perm_S (w3[3], w3[2], selector); w7[3] = __byte_perm_S (w3[2], w3[1], 
selector); w7[2] = __byte_perm_S (w3[1], w3[0], selector); w7[1] = __byte_perm_S (w3[0], w2[3], selector); w7[0] = __byte_perm_S (w2[3], w2[2], selector); w6[3] = __byte_perm_S (w2[2], w2[1], selector); w6[2] = __byte_perm_S (w2[1], w2[0], selector); w6[1] = __byte_perm_S (w2[0], w1[3], selector); w6[0] = __byte_perm_S (w1[3], w1[2], selector); w5[3] = __byte_perm_S (w1[2], w1[1], selector); w5[2] = __byte_perm_S (w1[1], w1[0], selector); w5[1] = __byte_perm_S (w1[0], w0[3], selector); w5[0] = __byte_perm_S (w0[3], w0[2], selector); w4[3] = __byte_perm_S (w0[2], w0[1], selector); w4[2] = __byte_perm_S (w0[1], w0[0], selector); w4[1] = __byte_perm_S (w0[0], 0, selector); w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 18: c4[2] = __byte_perm_S ( 0, w7[3], selector); c4[1] = __byte_perm_S (w7[3], w7[2], selector); c4[0] = __byte_perm_S (w7[2], w7[1], selector); c3[3] = __byte_perm_S (w7[1], w7[0], selector); c3[2] = __byte_perm_S (w7[0], w6[3], selector); c3[1] = __byte_perm_S (w6[3], w6[2], selector); c3[0] = __byte_perm_S (w6[2], w6[1], selector); c2[3] = __byte_perm_S (w6[1], w6[0], selector); c2[2] = __byte_perm_S (w6[0], w5[3], selector); c2[1] = __byte_perm_S (w5[3], w5[2], selector); c2[0] = __byte_perm_S (w5[2], w5[1], selector); c1[3] = __byte_perm_S (w5[1], w5[0], selector); c1[2] = __byte_perm_S (w5[0], w4[3], selector); c1[1] = __byte_perm_S (w4[3], w4[2], selector); c1[0] = __byte_perm_S (w4[2], w4[1], selector); c0[3] = __byte_perm_S (w4[1], w4[0], selector); c0[2] = __byte_perm_S (w4[0], w3[3], selector); c0[1] = __byte_perm_S (w3[3], w3[2], selector); c0[0] = __byte_perm_S (w3[2], w3[1], selector); w7[3] = __byte_perm_S (w3[1], w3[0], selector); w7[2] = __byte_perm_S (w3[0], w2[3], selector); w7[1] = __byte_perm_S (w2[3], w2[2], selector); w7[0] = __byte_perm_S (w2[2], w2[1], selector); w6[3] = 
__byte_perm_S (w2[1], w2[0], selector); w6[2] = __byte_perm_S (w2[0], w1[3], selector); w6[1] = __byte_perm_S (w1[3], w1[2], selector); w6[0] = __byte_perm_S (w1[2], w1[1], selector); w5[3] = __byte_perm_S (w1[1], w1[0], selector); w5[2] = __byte_perm_S (w1[0], w0[3], selector); w5[1] = __byte_perm_S (w0[3], w0[2], selector); w5[0] = __byte_perm_S (w0[2], w0[1], selector); w4[3] = __byte_perm_S (w0[1], w0[0], selector); w4[2] = __byte_perm_S (w0[0], 0, selector); w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 19: c4[3] = __byte_perm_S ( 0, w7[3], selector); c4[2] = __byte_perm_S (w7[3], w7[2], selector); c4[1] = __byte_perm_S (w7[2], w7[1], selector); c4[0] = __byte_perm_S (w7[1], w7[0], selector); c3[3] = __byte_perm_S (w7[0], w6[3], selector); c3[2] = __byte_perm_S (w6[3], w6[2], selector); c3[1] = __byte_perm_S (w6[2], w6[1], selector); c3[0] = __byte_perm_S (w6[1], w6[0], selector); c2[3] = __byte_perm_S (w6[0], w5[3], selector); c2[2] = __byte_perm_S (w5[3], w5[2], selector); c2[1] = __byte_perm_S (w5[2], w5[1], selector); c2[0] = __byte_perm_S (w5[1], w5[0], selector); c1[3] = __byte_perm_S (w5[0], w4[3], selector); c1[2] = __byte_perm_S (w4[3], w4[2], selector); c1[1] = __byte_perm_S (w4[2], w4[1], selector); c1[0] = __byte_perm_S (w4[1], w4[0], selector); c0[3] = __byte_perm_S (w4[0], w3[3], selector); c0[2] = __byte_perm_S (w3[3], w3[2], selector); c0[1] = __byte_perm_S (w3[2], w3[1], selector); c0[0] = __byte_perm_S (w3[1], w3[0], selector); w7[3] = __byte_perm_S (w3[0], w2[3], selector); w7[2] = __byte_perm_S (w2[3], w2[2], selector); w7[1] = __byte_perm_S (w2[2], w2[1], selector); w7[0] = __byte_perm_S (w2[1], w2[0], selector); w6[3] = __byte_perm_S (w2[0], w1[3], selector); w6[2] = __byte_perm_S (w1[3], w1[2], selector); w6[1] = __byte_perm_S (w1[2], w1[1], selector); w6[0] = __byte_perm_S 
(w1[1], w1[0], selector); w5[3] = __byte_perm_S (w1[0], w0[3], selector); w5[2] = __byte_perm_S (w0[3], w0[2], selector); w5[1] = __byte_perm_S (w0[2], w0[1], selector); w5[0] = __byte_perm_S (w0[1], w0[0], selector); w4[3] = __byte_perm_S (w0[0], 0, selector); w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 20: c5[0] = __byte_perm_S ( 0, w7[3], selector); c4[3] = __byte_perm_S (w7[3], w7[2], selector); c4[2] = __byte_perm_S (w7[2], w7[1], selector); c4[1] = __byte_perm_S (w7[1], w7[0], selector); c4[0] = __byte_perm_S (w7[0], w6[3], selector); c3[3] = __byte_perm_S (w6[3], w6[2], selector); c3[2] = __byte_perm_S (w6[2], w6[1], selector); c3[1] = __byte_perm_S (w6[1], w6[0], selector); c3[0] = __byte_perm_S (w6[0], w5[3], selector); c2[3] = __byte_perm_S (w5[3], w5[2], selector); c2[2] = __byte_perm_S (w5[2], w5[1], selector); c2[1] = __byte_perm_S (w5[1], w5[0], selector); c2[0] = __byte_perm_S (w5[0], w4[3], selector); c1[3] = __byte_perm_S (w4[3], w4[2], selector); c1[2] = __byte_perm_S (w4[2], w4[1], selector); c1[1] = __byte_perm_S (w4[1], w4[0], selector); c1[0] = __byte_perm_S (w4[0], w3[3], selector); c0[3] = __byte_perm_S (w3[3], w3[2], selector); c0[2] = __byte_perm_S (w3[2], w3[1], selector); c0[1] = __byte_perm_S (w3[1], w3[0], selector); c0[0] = __byte_perm_S (w3[0], w2[3], selector); w7[3] = __byte_perm_S (w2[3], w2[2], selector); w7[2] = __byte_perm_S (w2[2], w2[1], selector); w7[1] = __byte_perm_S (w2[1], w2[0], selector); w7[0] = __byte_perm_S (w2[0], w1[3], selector); w6[3] = __byte_perm_S (w1[3], w1[2], selector); w6[2] = __byte_perm_S (w1[2], w1[1], selector); w6[1] = __byte_perm_S (w1[1], w1[0], selector); w6[0] = __byte_perm_S (w1[0], w0[3], selector); w5[3] = __byte_perm_S (w0[3], w0[2], selector); w5[2] = __byte_perm_S (w0[2], w0[1], selector); w5[1] = __byte_perm_S 
(w0[1], w0[0], selector); w5[0] = __byte_perm_S (w0[0], 0, selector); w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 21: c5[1] = __byte_perm_S ( 0, w7[3], selector); c5[0] = __byte_perm_S (w7[3], w7[2], selector); c4[3] = __byte_perm_S (w7[2], w7[1], selector); c4[2] = __byte_perm_S (w7[1], w7[0], selector); c4[1] = __byte_perm_S (w7[0], w6[3], selector); c4[0] = __byte_perm_S (w6[3], w6[2], selector); c3[3] = __byte_perm_S (w6[2], w6[1], selector); c3[2] = __byte_perm_S (w6[1], w6[0], selector); c3[1] = __byte_perm_S (w6[0], w5[3], selector); c3[0] = __byte_perm_S (w5[3], w5[2], selector); c2[3] = __byte_perm_S (w5[2], w5[1], selector); c2[2] = __byte_perm_S (w5[1], w5[0], selector); c2[1] = __byte_perm_S (w5[0], w4[3], selector); c2[0] = __byte_perm_S (w4[3], w4[2], selector); c1[3] = __byte_perm_S (w4[2], w4[1], selector); c1[2] = __byte_perm_S (w4[1], w4[0], selector); c1[1] = __byte_perm_S (w4[0], w3[3], selector); c1[0] = __byte_perm_S (w3[3], w3[2], selector); c0[3] = __byte_perm_S (w3[2], w3[1], selector); c0[2] = __byte_perm_S (w3[1], w3[0], selector); c0[1] = __byte_perm_S (w3[0], w2[3], selector); c0[0] = __byte_perm_S (w2[3], w2[2], selector); w7[3] = __byte_perm_S (w2[2], w2[1], selector); w7[2] = __byte_perm_S (w2[1], w2[0], selector); w7[1] = __byte_perm_S (w2[0], w1[3], selector); w7[0] = __byte_perm_S (w1[3], w1[2], selector); w6[3] = __byte_perm_S (w1[2], w1[1], selector); w6[2] = __byte_perm_S (w1[1], w1[0], selector); w6[1] = __byte_perm_S (w1[0], w0[3], selector); w6[0] = __byte_perm_S (w0[3], w0[2], selector); w5[3] = __byte_perm_S (w0[2], w0[1], selector); w5[2] = __byte_perm_S (w0[1], w0[0], selector); w5[1] = __byte_perm_S (w0[0], 0, selector); w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] 
= 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 22: c5[2] = __byte_perm_S ( 0, w7[3], selector); c5[1] = __byte_perm_S (w7[3], w7[2], selector); c5[0] = __byte_perm_S (w7[2], w7[1], selector); c4[3] = __byte_perm_S (w7[1], w7[0], selector); c4[2] = __byte_perm_S (w7[0], w6[3], selector); c4[1] = __byte_perm_S (w6[3], w6[2], selector); c4[0] = __byte_perm_S (w6[2], w6[1], selector); c3[3] = __byte_perm_S (w6[1], w6[0], selector); c3[2] = __byte_perm_S (w6[0], w5[3], selector); c3[1] = __byte_perm_S (w5[3], w5[2], selector); c3[0] = __byte_perm_S (w5[2], w5[1], selector); c2[3] = __byte_perm_S (w5[1], w5[0], selector); c2[2] = __byte_perm_S (w5[0], w4[3], selector); c2[1] = __byte_perm_S (w4[3], w4[2], selector); c2[0] = __byte_perm_S (w4[2], w4[1], selector); c1[3] = __byte_perm_S (w4[1], w4[0], selector); c1[2] = __byte_perm_S (w4[0], w3[3], selector); c1[1] = __byte_perm_S (w3[3], w3[2], selector); c1[0] = __byte_perm_S (w3[2], w3[1], selector); c0[3] = __byte_perm_S (w3[1], w3[0], selector); c0[2] = __byte_perm_S (w3[0], w2[3], selector); c0[1] = __byte_perm_S (w2[3], w2[2], selector); c0[0] = __byte_perm_S (w2[2], w2[1], selector); w7[3] = __byte_perm_S (w2[1], w2[0], selector); w7[2] = __byte_perm_S (w2[0], w1[3], selector); w7[1] = __byte_perm_S (w1[3], w1[2], selector); w7[0] = __byte_perm_S (w1[2], w1[1], selector); w6[3] = __byte_perm_S (w1[1], w1[0], selector); w6[2] = __byte_perm_S (w1[0], w0[3], selector); w6[1] = __byte_perm_S (w0[3], w0[2], selector); w6[0] = __byte_perm_S (w0[2], w0[1], selector); w5[3] = __byte_perm_S (w0[1], w0[0], selector); w5[2] = __byte_perm_S (w0[0], 0, selector); w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 23: c5[3] = __byte_perm_S ( 0, w7[3], 
selector); c5[2] = __byte_perm_S (w7[3], w7[2], selector); c5[1] = __byte_perm_S (w7[2], w7[1], selector); c5[0] = __byte_perm_S (w7[1], w7[0], selector); c4[3] = __byte_perm_S (w7[0], w6[3], selector); c4[2] = __byte_perm_S (w6[3], w6[2], selector); c4[1] = __byte_perm_S (w6[2], w6[1], selector); c4[0] = __byte_perm_S (w6[1], w6[0], selector); c3[3] = __byte_perm_S (w6[0], w5[3], selector); c3[2] = __byte_perm_S (w5[3], w5[2], selector); c3[1] = __byte_perm_S (w5[2], w5[1], selector); c3[0] = __byte_perm_S (w5[1], w5[0], selector); c2[3] = __byte_perm_S (w5[0], w4[3], selector); c2[2] = __byte_perm_S (w4[3], w4[2], selector); c2[1] = __byte_perm_S (w4[2], w4[1], selector); c2[0] = __byte_perm_S (w4[1], w4[0], selector); c1[3] = __byte_perm_S (w4[0], w3[3], selector); c1[2] = __byte_perm_S (w3[3], w3[2], selector); c1[1] = __byte_perm_S (w3[2], w3[1], selector); c1[0] = __byte_perm_S (w3[1], w3[0], selector); c0[3] = __byte_perm_S (w3[0], w2[3], selector); c0[2] = __byte_perm_S (w2[3], w2[2], selector); c0[1] = __byte_perm_S (w2[2], w2[1], selector); c0[0] = __byte_perm_S (w2[1], w2[0], selector); w7[3] = __byte_perm_S (w2[0], w1[3], selector); w7[2] = __byte_perm_S (w1[3], w1[2], selector); w7[1] = __byte_perm_S (w1[2], w1[1], selector); w7[0] = __byte_perm_S (w1[1], w1[0], selector); w6[3] = __byte_perm_S (w1[0], w0[3], selector); w6[2] = __byte_perm_S (w0[3], w0[2], selector); w6[1] = __byte_perm_S (w0[2], w0[1], selector); w6[0] = __byte_perm_S (w0[1], w0[0], selector); w5[3] = __byte_perm_S (w0[0], 0, selector); w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 24: c6[0] = __byte_perm_S ( 0, w7[3], selector); c5[3] = __byte_perm_S (w7[3], w7[2], selector); c5[2] = __byte_perm_S (w7[2], w7[1], selector); c5[1] = __byte_perm_S (w7[1], w7[0], 
selector); c5[0] = __byte_perm_S (w7[0], w6[3], selector); c4[3] = __byte_perm_S (w6[3], w6[2], selector); c4[2] = __byte_perm_S (w6[2], w6[1], selector); c4[1] = __byte_perm_S (w6[1], w6[0], selector); c4[0] = __byte_perm_S (w6[0], w5[3], selector); c3[3] = __byte_perm_S (w5[3], w5[2], selector); c3[2] = __byte_perm_S (w5[2], w5[1], selector); c3[1] = __byte_perm_S (w5[1], w5[0], selector); c3[0] = __byte_perm_S (w5[0], w4[3], selector); c2[3] = __byte_perm_S (w4[3], w4[2], selector); c2[2] = __byte_perm_S (w4[2], w4[1], selector); c2[1] = __byte_perm_S (w4[1], w4[0], selector); c2[0] = __byte_perm_S (w4[0], w3[3], selector); c1[3] = __byte_perm_S (w3[3], w3[2], selector); c1[2] = __byte_perm_S (w3[2], w3[1], selector); c1[1] = __byte_perm_S (w3[1], w3[0], selector); c1[0] = __byte_perm_S (w3[0], w2[3], selector); c0[3] = __byte_perm_S (w2[3], w2[2], selector); c0[2] = __byte_perm_S (w2[2], w2[1], selector); c0[1] = __byte_perm_S (w2[1], w2[0], selector); c0[0] = __byte_perm_S (w2[0], w1[3], selector); w7[3] = __byte_perm_S (w1[3], w1[2], selector); w7[2] = __byte_perm_S (w1[2], w1[1], selector); w7[1] = __byte_perm_S (w1[1], w1[0], selector); w7[0] = __byte_perm_S (w1[0], w0[3], selector); w6[3] = __byte_perm_S (w0[3], w0[2], selector); w6[2] = __byte_perm_S (w0[2], w0[1], selector); w6[1] = __byte_perm_S (w0[1], w0[0], selector); w6[0] = __byte_perm_S (w0[0], 0, selector); w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 25: c6[1] = __byte_perm_S ( 0, w7[3], selector); c6[0] = __byte_perm_S (w7[3], w7[2], selector); c5[3] = __byte_perm_S (w7[2], w7[1], selector); c5[2] = __byte_perm_S (w7[1], w7[0], selector); c5[1] = __byte_perm_S (w7[0], w6[3], selector); c5[0] = __byte_perm_S (w6[3], w6[2], selector); c4[3] = __byte_perm_S (w6[2], 
w6[1], selector); c4[2] = __byte_perm_S (w6[1], w6[0], selector); c4[1] = __byte_perm_S (w6[0], w5[3], selector); c4[0] = __byte_perm_S (w5[3], w5[2], selector); c3[3] = __byte_perm_S (w5[2], w5[1], selector); c3[2] = __byte_perm_S (w5[1], w5[0], selector); c3[1] = __byte_perm_S (w5[0], w4[3], selector); c3[0] = __byte_perm_S (w4[3], w4[2], selector); c2[3] = __byte_perm_S (w4[2], w4[1], selector); c2[2] = __byte_perm_S (w4[1], w4[0], selector); c2[1] = __byte_perm_S (w4[0], w3[3], selector); c2[0] = __byte_perm_S (w3[3], w3[2], selector); c1[3] = __byte_perm_S (w3[2], w3[1], selector); c1[2] = __byte_perm_S (w3[1], w3[0], selector); c1[1] = __byte_perm_S (w3[0], w2[3], selector); c1[0] = __byte_perm_S (w2[3], w2[2], selector); c0[3] = __byte_perm_S (w2[2], w2[1], selector); c0[2] = __byte_perm_S (w2[1], w2[0], selector); c0[1] = __byte_perm_S (w2[0], w1[3], selector); c0[0] = __byte_perm_S (w1[3], w1[2], selector); w7[3] = __byte_perm_S (w1[2], w1[1], selector); w7[2] = __byte_perm_S (w1[1], w1[0], selector); w7[1] = __byte_perm_S (w1[0], w0[3], selector); w7[0] = __byte_perm_S (w0[3], w0[2], selector); w6[3] = __byte_perm_S (w0[2], w0[1], selector); w6[2] = __byte_perm_S (w0[1], w0[0], selector); w6[1] = __byte_perm_S (w0[0], 0, selector); w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 26: c6[2] = __byte_perm_S ( 0, w7[3], selector); c6[1] = __byte_perm_S (w7[3], w7[2], selector); c6[0] = __byte_perm_S (w7[2], w7[1], selector); c5[3] = __byte_perm_S (w7[1], w7[0], selector); c5[2] = __byte_perm_S (w7[0], w6[3], selector); c5[1] = __byte_perm_S (w6[3], w6[2], selector); c5[0] = __byte_perm_S (w6[2], w6[1], selector); c4[3] = __byte_perm_S (w6[1], w6[0], selector); c4[2] = __byte_perm_S (w6[0], w5[3], selector); c4[1] = 
__byte_perm_S (w5[3], w5[2], selector); c4[0] = __byte_perm_S (w5[2], w5[1], selector); c3[3] = __byte_perm_S (w5[1], w5[0], selector); c3[2] = __byte_perm_S (w5[0], w4[3], selector); c3[1] = __byte_perm_S (w4[3], w4[2], selector); c3[0] = __byte_perm_S (w4[2], w4[1], selector); c2[3] = __byte_perm_S (w4[1], w4[0], selector); c2[2] = __byte_perm_S (w4[0], w3[3], selector); c2[1] = __byte_perm_S (w3[3], w3[2], selector); c2[0] = __byte_perm_S (w3[2], w3[1], selector); c1[3] = __byte_perm_S (w3[1], w3[0], selector); c1[2] = __byte_perm_S (w3[0], w2[3], selector); c1[1] = __byte_perm_S (w2[3], w2[2], selector); c1[0] = __byte_perm_S (w2[2], w2[1], selector); c0[3] = __byte_perm_S (w2[1], w2[0], selector); c0[2] = __byte_perm_S (w2[0], w1[3], selector); c0[1] = __byte_perm_S (w1[3], w1[2], selector); c0[0] = __byte_perm_S (w1[2], w1[1], selector); w7[3] = __byte_perm_S (w1[1], w1[0], selector); w7[2] = __byte_perm_S (w1[0], w0[3], selector); w7[1] = __byte_perm_S (w0[3], w0[2], selector); w7[0] = __byte_perm_S (w0[2], w0[1], selector); w6[3] = __byte_perm_S (w0[1], w0[0], selector); w6[2] = __byte_perm_S (w0[0], 0, selector); w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 27: c6[3] = __byte_perm_S ( 0, w7[3], selector); c6[2] = __byte_perm_S (w7[3], w7[2], selector); c6[1] = __byte_perm_S (w7[2], w7[1], selector); c6[0] = __byte_perm_S (w7[1], w7[0], selector); c5[3] = __byte_perm_S (w7[0], w6[3], selector); c5[2] = __byte_perm_S (w6[3], w6[2], selector); c5[1] = __byte_perm_S (w6[2], w6[1], selector); c5[0] = __byte_perm_S (w6[1], w6[0], selector); c4[3] = __byte_perm_S (w6[0], w5[3], selector); c4[2] = __byte_perm_S (w5[3], w5[2], selector); c4[1] = __byte_perm_S (w5[2], w5[1], selector); c4[0] = __byte_perm_S (w5[1], 
w5[0], selector); c3[3] = __byte_perm_S (w5[0], w4[3], selector); c3[2] = __byte_perm_S (w4[3], w4[2], selector); c3[1] = __byte_perm_S (w4[2], w4[1], selector); c3[0] = __byte_perm_S (w4[1], w4[0], selector); c2[3] = __byte_perm_S (w4[0], w3[3], selector); c2[2] = __byte_perm_S (w3[3], w3[2], selector); c2[1] = __byte_perm_S (w3[2], w3[1], selector); c2[0] = __byte_perm_S (w3[1], w3[0], selector); c1[3] = __byte_perm_S (w3[0], w2[3], selector); c1[2] = __byte_perm_S (w2[3], w2[2], selector); c1[1] = __byte_perm_S (w2[2], w2[1], selector); c1[0] = __byte_perm_S (w2[1], w2[0], selector); c0[3] = __byte_perm_S (w2[0], w1[3], selector); c0[2] = __byte_perm_S (w1[3], w1[2], selector); c0[1] = __byte_perm_S (w1[2], w1[1], selector); c0[0] = __byte_perm_S (w1[1], w1[0], selector); w7[3] = __byte_perm_S (w1[0], w0[3], selector); w7[2] = __byte_perm_S (w0[3], w0[2], selector); w7[1] = __byte_perm_S (w0[2], w0[1], selector); w7[0] = __byte_perm_S (w0[1], w0[0], selector); w6[3] = __byte_perm_S (w0[0], 0, selector); w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 28: c7[0] = __byte_perm_S ( 0, w7[3], selector); c6[3] = __byte_perm_S (w7[3], w7[2], selector); c6[2] = __byte_perm_S (w7[2], w7[1], selector); c6[1] = __byte_perm_S (w7[1], w7[0], selector); c6[0] = __byte_perm_S (w7[0], w6[3], selector); c5[3] = __byte_perm_S (w6[3], w6[2], selector); c5[2] = __byte_perm_S (w6[2], w6[1], selector); c5[1] = __byte_perm_S (w6[1], w6[0], selector); c5[0] = __byte_perm_S (w6[0], w5[3], selector); c4[3] = __byte_perm_S (w5[3], w5[2], selector); c4[2] = __byte_perm_S (w5[2], w5[1], selector); c4[1] = __byte_perm_S (w5[1], w5[0], selector); c4[0] = __byte_perm_S (w5[0], w4[3], selector); c3[3] = __byte_perm_S (w4[3], w4[2], 
selector); c3[2] = __byte_perm_S (w4[2], w4[1], selector); c3[1] = __byte_perm_S (w4[1], w4[0], selector); c3[0] = __byte_perm_S (w4[0], w3[3], selector); c2[3] = __byte_perm_S (w3[3], w3[2], selector); c2[2] = __byte_perm_S (w3[2], w3[1], selector); c2[1] = __byte_perm_S (w3[1], w3[0], selector); c2[0] = __byte_perm_S (w3[0], w2[3], selector); c1[3] = __byte_perm_S (w2[3], w2[2], selector); c1[2] = __byte_perm_S (w2[2], w2[1], selector); c1[1] = __byte_perm_S (w2[1], w2[0], selector); c1[0] = __byte_perm_S (w2[0], w1[3], selector); c0[3] = __byte_perm_S (w1[3], w1[2], selector); c0[2] = __byte_perm_S (w1[2], w1[1], selector); c0[1] = __byte_perm_S (w1[1], w1[0], selector); c0[0] = __byte_perm_S (w1[0], w0[3], selector); w7[3] = __byte_perm_S (w0[3], w0[2], selector); w7[2] = __byte_perm_S (w0[2], w0[1], selector); w7[1] = __byte_perm_S (w0[1], w0[0], selector); w7[0] = __byte_perm_S (w0[0], 0, selector); w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 29: c7[1] = __byte_perm_S ( 0, w7[3], selector); c7[0] = __byte_perm_S (w7[3], w7[2], selector); c6[3] = __byte_perm_S (w7[2], w7[1], selector); c6[2] = __byte_perm_S (w7[1], w7[0], selector); c6[1] = __byte_perm_S (w7[0], w6[3], selector); c6[0] = __byte_perm_S (w6[3], w6[2], selector); c5[3] = __byte_perm_S (w6[2], w6[1], selector); c5[2] = __byte_perm_S (w6[1], w6[0], selector); c5[1] = __byte_perm_S (w6[0], w5[3], selector); c5[0] = __byte_perm_S (w5[3], w5[2], selector); c4[3] = __byte_perm_S (w5[2], w5[1], selector); c4[2] = __byte_perm_S (w5[1], w5[0], selector); c4[1] = __byte_perm_S (w5[0], w4[3], selector); c4[0] = __byte_perm_S (w4[3], w4[2], selector); c3[3] = __byte_perm_S (w4[2], w4[1], selector); c3[2] = __byte_perm_S (w4[1], w4[0], 
selector); c3[1] = __byte_perm_S (w4[0], w3[3], selector); c3[0] = __byte_perm_S (w3[3], w3[2], selector); c2[3] = __byte_perm_S (w3[2], w3[1], selector); c2[2] = __byte_perm_S (w3[1], w3[0], selector); c2[1] = __byte_perm_S (w3[0], w2[3], selector); c2[0] = __byte_perm_S (w2[3], w2[2], selector); c1[3] = __byte_perm_S (w2[2], w2[1], selector); c1[2] = __byte_perm_S (w2[1], w2[0], selector); c1[1] = __byte_perm_S (w2[0], w1[3], selector); c1[0] = __byte_perm_S (w1[3], w1[2], selector); c0[3] = __byte_perm_S (w1[2], w1[1], selector); c0[2] = __byte_perm_S (w1[1], w1[0], selector); c0[1] = __byte_perm_S (w1[0], w0[3], selector); c0[0] = __byte_perm_S (w0[3], w0[2], selector); w7[3] = __byte_perm_S (w0[2], w0[1], selector); w7[2] = __byte_perm_S (w0[1], w0[0], selector); w7[1] = __byte_perm_S (w0[0], 0, selector); w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 30: c7[2] = __byte_perm_S ( 0, w7[3], selector); c7[1] = __byte_perm_S (w7[3], w7[2], selector); c7[0] = __byte_perm_S (w7[2], w7[1], selector); c6[3] = __byte_perm_S (w7[1], w7[0], selector); c6[2] = __byte_perm_S (w7[0], w6[3], selector); c6[1] = __byte_perm_S (w6[3], w6[2], selector); c6[0] = __byte_perm_S (w6[2], w6[1], selector); c5[3] = __byte_perm_S (w6[1], w6[0], selector); c5[2] = __byte_perm_S (w6[0], w5[3], selector); c5[1] = __byte_perm_S (w5[3], w5[2], selector); c5[0] = __byte_perm_S (w5[2], w5[1], selector); c4[3] = __byte_perm_S (w5[1], w5[0], selector); c4[2] = __byte_perm_S (w5[0], w4[3], selector); c4[1] = __byte_perm_S (w4[3], w4[2], selector); c4[0] = __byte_perm_S (w4[2], w4[1], selector); c3[3] = __byte_perm_S (w4[1], w4[0], selector); c3[2] = __byte_perm_S (w4[0], w3[3], selector); c3[1] = __byte_perm_S 
(w3[3], w3[2], selector); c3[0] = __byte_perm_S (w3[2], w3[1], selector); c2[3] = __byte_perm_S (w3[1], w3[0], selector); c2[2] = __byte_perm_S (w3[0], w2[3], selector); c2[1] = __byte_perm_S (w2[3], w2[2], selector); c2[0] = __byte_perm_S (w2[2], w2[1], selector); c1[3] = __byte_perm_S (w2[1], w2[0], selector); c1[2] = __byte_perm_S (w2[0], w1[3], selector); c1[1] = __byte_perm_S (w1[3], w1[2], selector); c1[0] = __byte_perm_S (w1[2], w1[1], selector); c0[3] = __byte_perm_S (w1[1], w1[0], selector); c0[2] = __byte_perm_S (w1[0], w0[3], selector); c0[1] = __byte_perm_S (w0[3], w0[2], selector); c0[0] = __byte_perm_S (w0[2], w0[1], selector); w7[3] = __byte_perm_S (w0[1], w0[0], selector); w7[2] = __byte_perm_S (w0[0], 0, selector); w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; case 31: c7[3] = __byte_perm_S ( 0, w7[3], selector); c7[2] = __byte_perm_S (w7[3], w7[2], selector); c7[1] = __byte_perm_S (w7[2], w7[1], selector); c7[0] = __byte_perm_S (w7[1], w7[0], selector); c6[3] = __byte_perm_S (w7[0], w6[3], selector); c6[2] = __byte_perm_S (w6[3], w6[2], selector); c6[1] = __byte_perm_S (w6[2], w6[1], selector); c6[0] = __byte_perm_S (w6[1], w6[0], selector); c5[3] = __byte_perm_S (w6[0], w5[3], selector); c5[2] = __byte_perm_S (w5[3], w5[2], selector); c5[1] = __byte_perm_S (w5[2], w5[1], selector); c5[0] = __byte_perm_S (w5[1], w5[0], selector); c4[3] = __byte_perm_S (w5[0], w4[3], selector); c4[2] = __byte_perm_S (w4[3], w4[2], selector); c4[1] = __byte_perm_S (w4[2], w4[1], selector); c4[0] = __byte_perm_S (w4[1], w4[0], selector); c3[3] = __byte_perm_S (w4[0], w3[3], selector); c3[2] = __byte_perm_S (w3[3], w3[2], selector); c3[1] = __byte_perm_S (w3[2], w3[1], selector); 
c3[0] = __byte_perm_S (w3[1], w3[0], selector); c2[3] = __byte_perm_S (w3[0], w2[3], selector); c2[2] = __byte_perm_S (w2[3], w2[2], selector); c2[1] = __byte_perm_S (w2[2], w2[1], selector); c2[0] = __byte_perm_S (w2[1], w2[0], selector); c1[3] = __byte_perm_S (w2[0], w1[3], selector); c1[2] = __byte_perm_S (w1[3], w1[2], selector); c1[1] = __byte_perm_S (w1[2], w1[1], selector); c1[0] = __byte_perm_S (w1[1], w1[0], selector); c0[3] = __byte_perm_S (w1[0], w0[3], selector); c0[2] = __byte_perm_S (w0[3], w0[2], selector); c0[1] = __byte_perm_S (w0[2], w0[1], selector); c0[0] = __byte_perm_S (w0[1], w0[0], selector); w7[3] = __byte_perm_S (w0[0], 0, selector); w7[2] = 0; w7[1] = 0; w7[0] = 0; w6[3] = 0; w6[2] = 0; w6[1] = 0; w6[0] = 0; w5[3] = 0; w5[2] = 0; w5[1] = 0; w5[0] = 0; w4[3] = 0; w4[2] = 0; w4[1] = 0; w4[0] = 0; w3[3] = 0; w3[2] = 0; w3[1] = 0; w3[0] = 0; w2[3] = 0; w2[2] = 0; w2[1] = 0; w2[0] = 0; w1[3] = 0; w1[2] = 0; w1[1] = 0; w1[0] = 0; w0[3] = 0; w0[2] = 0; w0[1] = 0; w0[0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_1x64_le_S (u32 w[64], const u32 offset) { const int offset_mod_4 = offset & 3; const int offset_minus_4 = 4 - offset_mod_4; const int offset_switch = offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC #pragma unroll for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); switch (offset_switch) { case 0: w[63] = amd_bytealign_S (w[62], w[63], offset); w[62] = amd_bytealign_S (w[61], w[62], offset); w[61] = amd_bytealign_S (w[60], w[61], offset); w[60] = amd_bytealign_S (w[59], w[60], offset); w[59] = amd_bytealign_S (w[58], w[59], offset); w[58] = amd_bytealign_S (w[57], w[58], offset); w[57] = amd_bytealign_S (w[56], w[57], offset); w[56] = amd_bytealign_S (w[55], w[56], offset); w[55] = amd_bytealign_S (w[54], w[55], offset); w[54] = amd_bytealign_S (w[53], w[54], offset); w[53] = amd_bytealign_S (w[52], w[53], offset); w[52] = amd_bytealign_S (w[51], w[52], offset); w[51] = amd_bytealign_S (w[50], 
w[51], offset); w[50] = amd_bytealign_S (w[49], w[50], offset); w[49] = amd_bytealign_S (w[48], w[49], offset); w[48] = amd_bytealign_S (w[47], w[48], offset); w[47] = amd_bytealign_S (w[46], w[47], offset); w[46] = amd_bytealign_S (w[45], w[46], offset); w[45] = amd_bytealign_S (w[44], w[45], offset); w[44] = amd_bytealign_S (w[43], w[44], offset); w[43] = amd_bytealign_S (w[42], w[43], offset); w[42] = amd_bytealign_S (w[41], w[42], offset); w[41] = amd_bytealign_S (w[40], w[41], offset); w[40] = amd_bytealign_S (w[39], w[40], offset); w[39] = amd_bytealign_S (w[38], w[39], offset); w[38] = amd_bytealign_S (w[37], w[38], offset); w[37] = amd_bytealign_S (w[36], w[37], offset); w[36] = amd_bytealign_S (w[35], w[36], offset); w[35] = amd_bytealign_S (w[34], w[35], offset); w[34] = amd_bytealign_S (w[33], w[34], offset); w[33] = amd_bytealign_S (w[32], w[33], offset); w[32] = amd_bytealign_S (w[31], w[32], offset); w[31] = amd_bytealign_S (w[30], w[31], offset); w[30] = amd_bytealign_S (w[29], w[30], offset); w[29] = amd_bytealign_S (w[28], w[29], offset); w[28] = amd_bytealign_S (w[27], w[28], offset); w[27] = amd_bytealign_S (w[26], w[27], offset); w[26] = amd_bytealign_S (w[25], w[26], offset); w[25] = amd_bytealign_S (w[24], w[25], offset); w[24] = amd_bytealign_S (w[23], w[24], offset); w[23] = amd_bytealign_S (w[22], w[23], offset); w[22] = amd_bytealign_S (w[21], w[22], offset); w[21] = amd_bytealign_S (w[20], w[21], offset); w[20] = amd_bytealign_S (w[19], w[20], offset); w[19] = amd_bytealign_S (w[18], w[19], offset); w[18] = amd_bytealign_S (w[17], w[18], offset); w[17] = amd_bytealign_S (w[16], w[17], offset); w[16] = amd_bytealign_S (w[15], w[16], offset); w[15] = amd_bytealign_S (w[14], w[15], offset); w[14] = amd_bytealign_S (w[13], w[14], offset); w[13] = amd_bytealign_S (w[12], w[13], offset); w[12] = amd_bytealign_S (w[11], w[12], offset); w[11] = amd_bytealign_S (w[10], w[11], offset); w[10] = amd_bytealign_S (w[ 9], w[10], offset); w[ 9] = 
amd_bytealign_S (w[ 8], w[ 9], offset); w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 5] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 1] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); break; case 1: w[63] = amd_bytealign_S (w[61], w[62], offset); w[62] = amd_bytealign_S (w[60], w[61], offset); w[61] = amd_bytealign_S (w[59], w[60], offset); w[60] = amd_bytealign_S (w[58], w[59], offset); w[59] = amd_bytealign_S (w[57], w[58], offset); w[58] = amd_bytealign_S (w[56], w[57], offset); w[57] = amd_bytealign_S (w[55], w[56], offset); w[56] = amd_bytealign_S (w[54], w[55], offset); w[55] = amd_bytealign_S (w[53], w[54], offset); w[54] = amd_bytealign_S (w[52], w[53], offset); w[53] = amd_bytealign_S (w[51], w[52], offset); w[52] = amd_bytealign_S (w[50], w[51], offset); w[51] = amd_bytealign_S (w[49], w[50], offset); w[50] = amd_bytealign_S (w[48], w[49], offset); w[49] = amd_bytealign_S (w[47], w[48], offset); w[48] = amd_bytealign_S (w[46], w[47], offset); w[47] = amd_bytealign_S (w[45], w[46], offset); w[46] = amd_bytealign_S (w[44], w[45], offset); w[45] = amd_bytealign_S (w[43], w[44], offset); w[44] = amd_bytealign_S (w[42], w[43], offset); w[43] = amd_bytealign_S (w[41], w[42], offset); w[42] = amd_bytealign_S (w[40], w[41], offset); w[41] = amd_bytealign_S (w[39], w[40], offset); w[40] = amd_bytealign_S (w[38], w[39], offset); w[39] = amd_bytealign_S (w[37], w[38], offset); w[38] = amd_bytealign_S (w[36], w[37], offset); w[37] = amd_bytealign_S (w[35], w[36], offset); w[36] = amd_bytealign_S (w[34], w[35], offset); w[35] = amd_bytealign_S (w[33], w[34], offset); w[34] = amd_bytealign_S (w[32], w[33], offset); w[33] = amd_bytealign_S (w[31], w[32], offset); w[32] = amd_bytealign_S 
(w[30], w[31], offset); w[31] = amd_bytealign_S (w[29], w[30], offset); w[30] = amd_bytealign_S (w[28], w[29], offset); w[29] = amd_bytealign_S (w[27], w[28], offset); w[28] = amd_bytealign_S (w[26], w[27], offset); w[27] = amd_bytealign_S (w[25], w[26], offset); w[26] = amd_bytealign_S (w[24], w[25], offset); w[25] = amd_bytealign_S (w[23], w[24], offset); w[24] = amd_bytealign_S (w[22], w[23], offset); w[23] = amd_bytealign_S (w[21], w[22], offset); w[22] = amd_bytealign_S (w[20], w[21], offset); w[21] = amd_bytealign_S (w[19], w[20], offset); w[20] = amd_bytealign_S (w[18], w[19], offset); w[19] = amd_bytealign_S (w[17], w[18], offset); w[18] = amd_bytealign_S (w[16], w[17], offset); w[17] = amd_bytealign_S (w[15], w[16], offset); w[16] = amd_bytealign_S (w[14], w[15], offset); w[15] = amd_bytealign_S (w[13], w[14], offset); w[14] = amd_bytealign_S (w[12], w[13], offset); w[13] = amd_bytealign_S (w[11], w[12], offset); w[12] = amd_bytealign_S (w[10], w[11], offset); w[11] = amd_bytealign_S (w[ 9], w[10], offset); w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 5] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 3] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: w[63] = amd_bytealign_S (w[60], w[61], offset); w[62] = amd_bytealign_S (w[59], w[60], offset); w[61] = amd_bytealign_S (w[58], w[59], offset); w[60] = amd_bytealign_S (w[57], w[58], offset); w[59] = amd_bytealign_S (w[56], w[57], offset); w[58] = amd_bytealign_S (w[55], w[56], offset); w[57] = amd_bytealign_S (w[54], w[55], offset); w[56] = amd_bytealign_S (w[53], w[54], offset); w[55] = amd_bytealign_S (w[52], w[53], offset); w[54] = amd_bytealign_S (w[51], 
w[52], offset); w[53] = amd_bytealign_S (w[50], w[51], offset); w[52] = amd_bytealign_S (w[49], w[50], offset); w[51] = amd_bytealign_S (w[48], w[49], offset); w[50] = amd_bytealign_S (w[47], w[48], offset); w[49] = amd_bytealign_S (w[46], w[47], offset); w[48] = amd_bytealign_S (w[45], w[46], offset); w[47] = amd_bytealign_S (w[44], w[45], offset); w[46] = amd_bytealign_S (w[43], w[44], offset); w[45] = amd_bytealign_S (w[42], w[43], offset); w[44] = amd_bytealign_S (w[41], w[42], offset); w[43] = amd_bytealign_S (w[40], w[41], offset); w[42] = amd_bytealign_S (w[39], w[40], offset); w[41] = amd_bytealign_S (w[38], w[39], offset); w[40] = amd_bytealign_S (w[37], w[38], offset); w[39] = amd_bytealign_S (w[36], w[37], offset); w[38] = amd_bytealign_S (w[35], w[36], offset); w[37] = amd_bytealign_S (w[34], w[35], offset); w[36] = amd_bytealign_S (w[33], w[34], offset); w[35] = amd_bytealign_S (w[32], w[33], offset); w[34] = amd_bytealign_S (w[31], w[32], offset); w[33] = amd_bytealign_S (w[30], w[31], offset); w[32] = amd_bytealign_S (w[29], w[30], offset); w[31] = amd_bytealign_S (w[28], w[29], offset); w[30] = amd_bytealign_S (w[27], w[28], offset); w[29] = amd_bytealign_S (w[26], w[27], offset); w[28] = amd_bytealign_S (w[25], w[26], offset); w[27] = amd_bytealign_S (w[24], w[25], offset); w[26] = amd_bytealign_S (w[23], w[24], offset); w[25] = amd_bytealign_S (w[22], w[23], offset); w[24] = amd_bytealign_S (w[21], w[22], offset); w[23] = amd_bytealign_S (w[20], w[21], offset); w[22] = amd_bytealign_S (w[19], w[20], offset); w[21] = amd_bytealign_S (w[18], w[19], offset); w[20] = amd_bytealign_S (w[17], w[18], offset); w[19] = amd_bytealign_S (w[16], w[17], offset); w[18] = amd_bytealign_S (w[15], w[16], offset); w[17] = amd_bytealign_S (w[14], w[15], offset); w[16] = amd_bytealign_S (w[13], w[14], offset); w[15] = amd_bytealign_S (w[12], w[13], offset); w[14] = amd_bytealign_S (w[11], w[12], offset); w[13] = amd_bytealign_S (w[10], w[11], offset); w[12] = 
amd_bytealign_S (w[ 9], w[10], offset); w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = amd_bytealign_S (w[59], w[60], offset); w[62] = amd_bytealign_S (w[58], w[59], offset); w[61] = amd_bytealign_S (w[57], w[58], offset); w[60] = amd_bytealign_S (w[56], w[57], offset); w[59] = amd_bytealign_S (w[55], w[56], offset); w[58] = amd_bytealign_S (w[54], w[55], offset); w[57] = amd_bytealign_S (w[53], w[54], offset); w[56] = amd_bytealign_S (w[52], w[53], offset); w[55] = amd_bytealign_S (w[51], w[52], offset); w[54] = amd_bytealign_S (w[50], w[51], offset); w[53] = amd_bytealign_S (w[49], w[50], offset); w[52] = amd_bytealign_S (w[48], w[49], offset); w[51] = amd_bytealign_S (w[47], w[48], offset); w[50] = amd_bytealign_S (w[46], w[47], offset); w[49] = amd_bytealign_S (w[45], w[46], offset); w[48] = amd_bytealign_S (w[44], w[45], offset); w[47] = amd_bytealign_S (w[43], w[44], offset); w[46] = amd_bytealign_S (w[42], w[43], offset); w[45] = amd_bytealign_S (w[41], w[42], offset); w[44] = amd_bytealign_S (w[40], w[41], offset); w[43] = amd_bytealign_S (w[39], w[40], offset); w[42] = amd_bytealign_S (w[38], w[39], offset); w[41] = amd_bytealign_S (w[37], w[38], offset); w[40] = amd_bytealign_S (w[36], w[37], offset); w[39] = amd_bytealign_S (w[35], w[36], offset); w[38] = amd_bytealign_S (w[34], w[35], offset); w[37] = amd_bytealign_S (w[33], w[34], offset); w[36] = amd_bytealign_S (w[32], w[33], offset); w[35] = amd_bytealign_S (w[31], w[32], offset); w[34] = amd_bytealign_S (w[30], w[31], offset); w[33] 
= amd_bytealign_S (w[29], w[30], offset); w[32] = amd_bytealign_S (w[28], w[29], offset); w[31] = amd_bytealign_S (w[27], w[28], offset); w[30] = amd_bytealign_S (w[26], w[27], offset); w[29] = amd_bytealign_S (w[25], w[26], offset); w[28] = amd_bytealign_S (w[24], w[25], offset); w[27] = amd_bytealign_S (w[23], w[24], offset); w[26] = amd_bytealign_S (w[22], w[23], offset); w[25] = amd_bytealign_S (w[21], w[22], offset); w[24] = amd_bytealign_S (w[20], w[21], offset); w[23] = amd_bytealign_S (w[19], w[20], offset); w[22] = amd_bytealign_S (w[18], w[19], offset); w[21] = amd_bytealign_S (w[17], w[18], offset); w[20] = amd_bytealign_S (w[16], w[17], offset); w[19] = amd_bytealign_S (w[15], w[16], offset); w[18] = amd_bytealign_S (w[14], w[15], offset); w[17] = amd_bytealign_S (w[13], w[14], offset); w[16] = amd_bytealign_S (w[12], w[13], offset); w[15] = amd_bytealign_S (w[11], w[12], offset); w[14] = amd_bytealign_S (w[10], w[11], offset); w[13] = amd_bytealign_S (w[ 9], w[10], offset); w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); w[11] = amd_bytealign_S (w[ 7], w[ 8], offset); w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 7] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = amd_bytealign_S (w[58], w[59], offset); w[62] = amd_bytealign_S (w[57], w[58], offset); w[61] = amd_bytealign_S (w[56], w[57], offset); w[60] = amd_bytealign_S (w[55], w[56], offset); w[59] = amd_bytealign_S (w[54], w[55], offset); w[58] = amd_bytealign_S (w[53], w[54], offset); w[57] = amd_bytealign_S (w[52], w[53], offset); w[56] = amd_bytealign_S (w[51], w[52], offset); w[55] = amd_bytealign_S (w[50], w[51], offset); w[54] = amd_bytealign_S (w[49], w[50], 
offset); w[53] = amd_bytealign_S (w[48], w[49], offset); w[52] = amd_bytealign_S (w[47], w[48], offset); w[51] = amd_bytealign_S (w[46], w[47], offset); w[50] = amd_bytealign_S (w[45], w[46], offset); w[49] = amd_bytealign_S (w[44], w[45], offset); w[48] = amd_bytealign_S (w[43], w[44], offset); w[47] = amd_bytealign_S (w[42], w[43], offset); w[46] = amd_bytealign_S (w[41], w[42], offset); w[45] = amd_bytealign_S (w[40], w[41], offset); w[44] = amd_bytealign_S (w[39], w[40], offset); w[43] = amd_bytealign_S (w[38], w[39], offset); w[42] = amd_bytealign_S (w[37], w[38], offset); w[41] = amd_bytealign_S (w[36], w[37], offset); w[40] = amd_bytealign_S (w[35], w[36], offset); w[39] = amd_bytealign_S (w[34], w[35], offset); w[38] = amd_bytealign_S (w[33], w[34], offset); w[37] = amd_bytealign_S (w[32], w[33], offset); w[36] = amd_bytealign_S (w[31], w[32], offset); w[35] = amd_bytealign_S (w[30], w[31], offset); w[34] = amd_bytealign_S (w[29], w[30], offset); w[33] = amd_bytealign_S (w[28], w[29], offset); w[32] = amd_bytealign_S (w[27], w[28], offset); w[31] = amd_bytealign_S (w[26], w[27], offset); w[30] = amd_bytealign_S (w[25], w[26], offset); w[29] = amd_bytealign_S (w[24], w[25], offset); w[28] = amd_bytealign_S (w[23], w[24], offset); w[27] = amd_bytealign_S (w[22], w[23], offset); w[26] = amd_bytealign_S (w[21], w[22], offset); w[25] = amd_bytealign_S (w[20], w[21], offset); w[24] = amd_bytealign_S (w[19], w[20], offset); w[23] = amd_bytealign_S (w[18], w[19], offset); w[22] = amd_bytealign_S (w[17], w[18], offset); w[21] = amd_bytealign_S (w[16], w[17], offset); w[20] = amd_bytealign_S (w[15], w[16], offset); w[19] = amd_bytealign_S (w[14], w[15], offset); w[18] = amd_bytealign_S (w[13], w[14], offset); w[17] = amd_bytealign_S (w[12], w[13], offset); w[16] = amd_bytealign_S (w[11], w[12], offset); w[15] = amd_bytealign_S (w[10], w[11], offset); w[14] = amd_bytealign_S (w[ 9], w[10], offset); w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); w[12] = 
amd_bytealign_S (w[ 7], w[ 8], offset); w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 4] = amd_bytealign_S ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = amd_bytealign_S (w[57], w[58], offset); w[62] = amd_bytealign_S (w[56], w[57], offset); w[61] = amd_bytealign_S (w[55], w[56], offset); w[60] = amd_bytealign_S (w[54], w[55], offset); w[59] = amd_bytealign_S (w[53], w[54], offset); w[58] = amd_bytealign_S (w[52], w[53], offset); w[57] = amd_bytealign_S (w[51], w[52], offset); w[56] = amd_bytealign_S (w[50], w[51], offset); w[55] = amd_bytealign_S (w[49], w[50], offset); w[54] = amd_bytealign_S (w[48], w[49], offset); w[53] = amd_bytealign_S (w[47], w[48], offset); w[52] = amd_bytealign_S (w[46], w[47], offset); w[51] = amd_bytealign_S (w[45], w[46], offset); w[50] = amd_bytealign_S (w[44], w[45], offset); w[49] = amd_bytealign_S (w[43], w[44], offset); w[48] = amd_bytealign_S (w[42], w[43], offset); w[47] = amd_bytealign_S (w[41], w[42], offset); w[46] = amd_bytealign_S (w[40], w[41], offset); w[45] = amd_bytealign_S (w[39], w[40], offset); w[44] = amd_bytealign_S (w[38], w[39], offset); w[43] = amd_bytealign_S (w[37], w[38], offset); w[42] = amd_bytealign_S (w[36], w[37], offset); w[41] = amd_bytealign_S (w[35], w[36], offset); w[40] = amd_bytealign_S (w[34], w[35], offset); w[39] = amd_bytealign_S (w[33], w[34], offset); w[38] = amd_bytealign_S (w[32], w[33], offset); w[37] = amd_bytealign_S (w[31], w[32], offset); w[36] = amd_bytealign_S (w[30], w[31], offset); w[35] = amd_bytealign_S (w[29], w[30], offset); w[34] = amd_bytealign_S (w[28], w[29], offset); w[33] = amd_bytealign_S (w[27], w[28], offset); w[32] = amd_bytealign_S (w[26], 
w[27], offset); w[31] = amd_bytealign_S (w[25], w[26], offset); w[30] = amd_bytealign_S (w[24], w[25], offset); w[29] = amd_bytealign_S (w[23], w[24], offset); w[28] = amd_bytealign_S (w[22], w[23], offset); w[27] = amd_bytealign_S (w[21], w[22], offset); w[26] = amd_bytealign_S (w[20], w[21], offset); w[25] = amd_bytealign_S (w[19], w[20], offset); w[24] = amd_bytealign_S (w[18], w[19], offset); w[23] = amd_bytealign_S (w[17], w[18], offset); w[22] = amd_bytealign_S (w[16], w[17], offset); w[21] = amd_bytealign_S (w[15], w[16], offset); w[20] = amd_bytealign_S (w[14], w[15], offset); w[19] = amd_bytealign_S (w[13], w[14], offset); w[18] = amd_bytealign_S (w[12], w[13], offset); w[17] = amd_bytealign_S (w[11], w[12], offset); w[16] = amd_bytealign_S (w[10], w[11], offset); w[15] = amd_bytealign_S (w[ 9], w[10], offset); w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 5] = amd_bytealign_S ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = amd_bytealign_S (w[56], w[57], offset); w[62] = amd_bytealign_S (w[55], w[56], offset); w[61] = amd_bytealign_S (w[54], w[55], offset); w[60] = amd_bytealign_S (w[53], w[54], offset); w[59] = amd_bytealign_S (w[52], w[53], offset); w[58] = amd_bytealign_S (w[51], w[52], offset); w[57] = amd_bytealign_S (w[50], w[51], offset); w[56] = amd_bytealign_S (w[49], w[50], offset); w[55] = amd_bytealign_S (w[48], w[49], offset); w[54] = amd_bytealign_S (w[47], w[48], offset); w[53] = amd_bytealign_S (w[46], w[47], offset); w[52] = amd_bytealign_S (w[45], w[46], offset); w[51] = amd_bytealign_S (w[44], w[45], 
offset); w[50] = amd_bytealign_S (w[43], w[44], offset); w[49] = amd_bytealign_S (w[42], w[43], offset); w[48] = amd_bytealign_S (w[41], w[42], offset); w[47] = amd_bytealign_S (w[40], w[41], offset); w[46] = amd_bytealign_S (w[39], w[40], offset); w[45] = amd_bytealign_S (w[38], w[39], offset); w[44] = amd_bytealign_S (w[37], w[38], offset); w[43] = amd_bytealign_S (w[36], w[37], offset); w[42] = amd_bytealign_S (w[35], w[36], offset); w[41] = amd_bytealign_S (w[34], w[35], offset); w[40] = amd_bytealign_S (w[33], w[34], offset); w[39] = amd_bytealign_S (w[32], w[33], offset); w[38] = amd_bytealign_S (w[31], w[32], offset); w[37] = amd_bytealign_S (w[30], w[31], offset); w[36] = amd_bytealign_S (w[29], w[30], offset); w[35] = amd_bytealign_S (w[28], w[29], offset); w[34] = amd_bytealign_S (w[27], w[28], offset); w[33] = amd_bytealign_S (w[26], w[27], offset); w[32] = amd_bytealign_S (w[25], w[26], offset); w[31] = amd_bytealign_S (w[24], w[25], offset); w[30] = amd_bytealign_S (w[23], w[24], offset); w[29] = amd_bytealign_S (w[22], w[23], offset); w[28] = amd_bytealign_S (w[21], w[22], offset); w[27] = amd_bytealign_S (w[20], w[21], offset); w[26] = amd_bytealign_S (w[19], w[20], offset); w[25] = amd_bytealign_S (w[18], w[19], offset); w[24] = amd_bytealign_S (w[17], w[18], offset); w[23] = amd_bytealign_S (w[16], w[17], offset); w[22] = amd_bytealign_S (w[15], w[16], offset); w[21] = amd_bytealign_S (w[14], w[15], offset); w[20] = amd_bytealign_S (w[13], w[14], offset); w[19] = amd_bytealign_S (w[12], w[13], offset); w[18] = amd_bytealign_S (w[11], w[12], offset); w[17] = amd_bytealign_S (w[10], w[11], offset); w[16] = amd_bytealign_S (w[ 9], w[10], offset); w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 9] = 
amd_bytealign_S (w[ 2], w[ 3], offset); w[ 8] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = amd_bytealign_S (w[55], w[56], offset); w[62] = amd_bytealign_S (w[54], w[55], offset); w[61] = amd_bytealign_S (w[53], w[54], offset); w[60] = amd_bytealign_S (w[52], w[53], offset); w[59] = amd_bytealign_S (w[51], w[52], offset); w[58] = amd_bytealign_S (w[50], w[51], offset); w[57] = amd_bytealign_S (w[49], w[50], offset); w[56] = amd_bytealign_S (w[48], w[49], offset); w[55] = amd_bytealign_S (w[47], w[48], offset); w[54] = amd_bytealign_S (w[46], w[47], offset); w[53] = amd_bytealign_S (w[45], w[46], offset); w[52] = amd_bytealign_S (w[44], w[45], offset); w[51] = amd_bytealign_S (w[43], w[44], offset); w[50] = amd_bytealign_S (w[42], w[43], offset); w[49] = amd_bytealign_S (w[41], w[42], offset); w[48] = amd_bytealign_S (w[40], w[41], offset); w[47] = amd_bytealign_S (w[39], w[40], offset); w[46] = amd_bytealign_S (w[38], w[39], offset); w[45] = amd_bytealign_S (w[37], w[38], offset); w[44] = amd_bytealign_S (w[36], w[37], offset); w[43] = amd_bytealign_S (w[35], w[36], offset); w[42] = amd_bytealign_S (w[34], w[35], offset); w[41] = amd_bytealign_S (w[33], w[34], offset); w[40] = amd_bytealign_S (w[32], w[33], offset); w[39] = amd_bytealign_S (w[31], w[32], offset); w[38] = amd_bytealign_S (w[30], w[31], offset); w[37] = amd_bytealign_S (w[29], w[30], offset); w[36] = amd_bytealign_S (w[28], w[29], offset); w[35] = amd_bytealign_S (w[27], w[28], offset); w[34] = amd_bytealign_S (w[26], w[27], offset); w[33] = amd_bytealign_S (w[25], w[26], offset); w[32] = amd_bytealign_S (w[24], w[25], offset); w[31] = amd_bytealign_S (w[23], w[24], offset); w[30] = amd_bytealign_S (w[22], w[23], offset); w[29] = amd_bytealign_S (w[21], w[22], offset); w[28] = amd_bytealign_S (w[20], w[21], offset); w[27] = 
amd_bytealign_S (w[19], w[20], offset); w[26] = amd_bytealign_S (w[18], w[19], offset); w[25] = amd_bytealign_S (w[17], w[18], offset); w[24] = amd_bytealign_S (w[16], w[17], offset); w[23] = amd_bytealign_S (w[15], w[16], offset); w[22] = amd_bytealign_S (w[14], w[15], offset); w[21] = amd_bytealign_S (w[13], w[14], offset); w[20] = amd_bytealign_S (w[12], w[13], offset); w[19] = amd_bytealign_S (w[11], w[12], offset); w[18] = amd_bytealign_S (w[10], w[11], offset); w[17] = amd_bytealign_S (w[ 9], w[10], offset); w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); w[11] = amd_bytealign_S (w[ 3], w[ 4], offset); w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = amd_bytealign_S (w[54], w[55], offset); w[62] = amd_bytealign_S (w[53], w[54], offset); w[61] = amd_bytealign_S (w[52], w[53], offset); w[60] = amd_bytealign_S (w[51], w[52], offset); w[59] = amd_bytealign_S (w[50], w[51], offset); w[58] = amd_bytealign_S (w[49], w[50], offset); w[57] = amd_bytealign_S (w[48], w[49], offset); w[56] = amd_bytealign_S (w[47], w[48], offset); w[55] = amd_bytealign_S (w[46], w[47], offset); w[54] = amd_bytealign_S (w[45], w[46], offset); w[53] = amd_bytealign_S (w[44], w[45], offset); w[52] = amd_bytealign_S (w[43], w[44], offset); w[51] = amd_bytealign_S (w[42], w[43], offset); w[50] = amd_bytealign_S (w[41], w[42], offset); w[49] = amd_bytealign_S (w[40], w[41], offset); w[48] = amd_bytealign_S (w[39], w[40], offset); w[47] = amd_bytealign_S (w[38], w[39], offset); w[46] = amd_bytealign_S (w[37], w[38], offset); w[45] = amd_bytealign_S (w[36], w[37], 
offset); w[44] = amd_bytealign_S (w[35], w[36], offset); w[43] = amd_bytealign_S (w[34], w[35], offset); w[42] = amd_bytealign_S (w[33], w[34], offset); w[41] = amd_bytealign_S (w[32], w[33], offset); w[40] = amd_bytealign_S (w[31], w[32], offset); w[39] = amd_bytealign_S (w[30], w[31], offset); w[38] = amd_bytealign_S (w[29], w[30], offset); w[37] = amd_bytealign_S (w[28], w[29], offset); w[36] = amd_bytealign_S (w[27], w[28], offset); w[35] = amd_bytealign_S (w[26], w[27], offset); w[34] = amd_bytealign_S (w[25], w[26], offset); w[33] = amd_bytealign_S (w[24], w[25], offset); w[32] = amd_bytealign_S (w[23], w[24], offset); w[31] = amd_bytealign_S (w[22], w[23], offset); w[30] = amd_bytealign_S (w[21], w[22], offset); w[29] = amd_bytealign_S (w[20], w[21], offset); w[28] = amd_bytealign_S (w[19], w[20], offset); w[27] = amd_bytealign_S (w[18], w[19], offset); w[26] = amd_bytealign_S (w[17], w[18], offset); w[25] = amd_bytealign_S (w[16], w[17], offset); w[24] = amd_bytealign_S (w[15], w[16], offset); w[23] = amd_bytealign_S (w[14], w[15], offset); w[22] = amd_bytealign_S (w[13], w[14], offset); w[21] = amd_bytealign_S (w[12], w[13], offset); w[20] = amd_bytealign_S (w[11], w[12], offset); w[19] = amd_bytealign_S (w[10], w[11], offset); w[18] = amd_bytealign_S (w[ 9], w[10], offset); w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = amd_bytealign_S (w[53], w[54], offset); w[62] = amd_bytealign_S (w[52], w[53], offset); w[61] = 
amd_bytealign_S (w[51], w[52], offset); w[60] = amd_bytealign_S (w[50], w[51], offset); w[59] = amd_bytealign_S (w[49], w[50], offset); w[58] = amd_bytealign_S (w[48], w[49], offset); w[57] = amd_bytealign_S (w[47], w[48], offset); w[56] = amd_bytealign_S (w[46], w[47], offset); w[55] = amd_bytealign_S (w[45], w[46], offset); w[54] = amd_bytealign_S (w[44], w[45], offset); w[53] = amd_bytealign_S (w[43], w[44], offset); w[52] = amd_bytealign_S (w[42], w[43], offset); w[51] = amd_bytealign_S (w[41], w[42], offset); w[50] = amd_bytealign_S (w[40], w[41], offset); w[49] = amd_bytealign_S (w[39], w[40], offset); w[48] = amd_bytealign_S (w[38], w[39], offset); w[47] = amd_bytealign_S (w[37], w[38], offset); w[46] = amd_bytealign_S (w[36], w[37], offset); w[45] = amd_bytealign_S (w[35], w[36], offset); w[44] = amd_bytealign_S (w[34], w[35], offset); w[43] = amd_bytealign_S (w[33], w[34], offset); w[42] = amd_bytealign_S (w[32], w[33], offset); w[41] = amd_bytealign_S (w[31], w[32], offset); w[40] = amd_bytealign_S (w[30], w[31], offset); w[39] = amd_bytealign_S (w[29], w[30], offset); w[38] = amd_bytealign_S (w[28], w[29], offset); w[37] = amd_bytealign_S (w[27], w[28], offset); w[36] = amd_bytealign_S (w[26], w[27], offset); w[35] = amd_bytealign_S (w[25], w[26], offset); w[34] = amd_bytealign_S (w[24], w[25], offset); w[33] = amd_bytealign_S (w[23], w[24], offset); w[32] = amd_bytealign_S (w[22], w[23], offset); w[31] = amd_bytealign_S (w[21], w[22], offset); w[30] = amd_bytealign_S (w[20], w[21], offset); w[29] = amd_bytealign_S (w[19], w[20], offset); w[28] = amd_bytealign_S (w[18], w[19], offset); w[27] = amd_bytealign_S (w[17], w[18], offset); w[26] = amd_bytealign_S (w[16], w[17], offset); w[25] = amd_bytealign_S (w[15], w[16], offset); w[24] = amd_bytealign_S (w[14], w[15], offset); w[23] = amd_bytealign_S (w[13], w[14], offset); w[22] = amd_bytealign_S (w[12], w[13], offset); w[21] = amd_bytealign_S (w[11], w[12], offset); w[20] = amd_bytealign_S (w[10], w[11], 
offset); w[19] = amd_bytealign_S (w[ 9], w[10], offset); w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = amd_bytealign_S (w[52], w[53], offset); w[62] = amd_bytealign_S (w[51], w[52], offset); w[61] = amd_bytealign_S (w[50], w[51], offset); w[60] = amd_bytealign_S (w[49], w[50], offset); w[59] = amd_bytealign_S (w[48], w[49], offset); w[58] = amd_bytealign_S (w[47], w[48], offset); w[57] = amd_bytealign_S (w[46], w[47], offset); w[56] = amd_bytealign_S (w[45], w[46], offset); w[55] = amd_bytealign_S (w[44], w[45], offset); w[54] = amd_bytealign_S (w[43], w[44], offset); w[53] = amd_bytealign_S (w[42], w[43], offset); w[52] = amd_bytealign_S (w[41], w[42], offset); w[51] = amd_bytealign_S (w[40], w[41], offset); w[50] = amd_bytealign_S (w[39], w[40], offset); w[49] = amd_bytealign_S (w[38], w[39], offset); w[48] = amd_bytealign_S (w[37], w[38], offset); w[47] = amd_bytealign_S (w[36], w[37], offset); w[46] = amd_bytealign_S (w[35], w[36], offset); w[45] = amd_bytealign_S (w[34], w[35], offset); w[44] = amd_bytealign_S (w[33], w[34], offset); w[43] = amd_bytealign_S (w[32], w[33], offset); w[42] = amd_bytealign_S (w[31], w[32], offset); w[41] = amd_bytealign_S (w[30], w[31], offset); w[40] = amd_bytealign_S (w[29], w[30], offset); w[39] = amd_bytealign_S (w[28], w[29], offset); w[38] = amd_bytealign_S (w[27], w[28], offset); w[37] = amd_bytealign_S (w[26], w[27], offset); w[36] = amd_bytealign_S (w[25], w[26], offset); w[35] 
= amd_bytealign_S (w[24], w[25], offset); w[34] = amd_bytealign_S (w[23], w[24], offset); w[33] = amd_bytealign_S (w[22], w[23], offset); w[32] = amd_bytealign_S (w[21], w[22], offset); w[31] = amd_bytealign_S (w[20], w[21], offset); w[30] = amd_bytealign_S (w[19], w[20], offset); w[29] = amd_bytealign_S (w[18], w[19], offset); w[28] = amd_bytealign_S (w[17], w[18], offset); w[27] = amd_bytealign_S (w[16], w[17], offset); w[26] = amd_bytealign_S (w[15], w[16], offset); w[25] = amd_bytealign_S (w[14], w[15], offset); w[24] = amd_bytealign_S (w[13], w[14], offset); w[23] = amd_bytealign_S (w[12], w[13], offset); w[22] = amd_bytealign_S (w[11], w[12], offset); w[21] = amd_bytealign_S (w[10], w[11], offset); w[20] = amd_bytealign_S (w[ 9], w[10], offset); w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); w[17] = amd_bytealign_S (w[ 6], w[ 7], offset); w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); w[10] = amd_bytealign_S ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = amd_bytealign_S (w[51], w[52], offset); w[62] = amd_bytealign_S (w[50], w[51], offset); w[61] = amd_bytealign_S (w[49], w[50], offset); w[60] = amd_bytealign_S (w[48], w[49], offset); w[59] = amd_bytealign_S (w[47], w[48], offset); w[58] = amd_bytealign_S (w[46], w[47], offset); w[57] = amd_bytealign_S (w[45], w[46], offset); w[56] = amd_bytealign_S (w[44], w[45], offset); w[55] = amd_bytealign_S (w[43], w[44], offset); w[54] = amd_bytealign_S (w[42], w[43], offset); w[53] = amd_bytealign_S (w[41], w[42], offset); w[52] = amd_bytealign_S (w[40], w[41], offset); w[51] = amd_bytealign_S (w[39], w[40], offset); w[50] = 
amd_bytealign_S (w[38], w[39], offset); w[49] = amd_bytealign_S (w[37], w[38], offset); w[48] = amd_bytealign_S (w[36], w[37], offset); w[47] = amd_bytealign_S (w[35], w[36], offset); w[46] = amd_bytealign_S (w[34], w[35], offset); w[45] = amd_bytealign_S (w[33], w[34], offset); w[44] = amd_bytealign_S (w[32], w[33], offset); w[43] = amd_bytealign_S (w[31], w[32], offset); w[42] = amd_bytealign_S (w[30], w[31], offset); w[41] = amd_bytealign_S (w[29], w[30], offset); w[40] = amd_bytealign_S (w[28], w[29], offset); w[39] = amd_bytealign_S (w[27], w[28], offset); w[38] = amd_bytealign_S (w[26], w[27], offset); w[37] = amd_bytealign_S (w[25], w[26], offset); w[36] = amd_bytealign_S (w[24], w[25], offset); w[35] = amd_bytealign_S (w[23], w[24], offset); w[34] = amd_bytealign_S (w[22], w[23], offset); w[33] = amd_bytealign_S (w[21], w[22], offset); w[32] = amd_bytealign_S (w[20], w[21], offset); w[31] = amd_bytealign_S (w[19], w[20], offset); w[30] = amd_bytealign_S (w[18], w[19], offset); w[29] = amd_bytealign_S (w[17], w[18], offset); w[28] = amd_bytealign_S (w[16], w[17], offset); w[27] = amd_bytealign_S (w[15], w[16], offset); w[26] = amd_bytealign_S (w[14], w[15], offset); w[25] = amd_bytealign_S (w[13], w[14], offset); w[24] = amd_bytealign_S (w[12], w[13], offset); w[23] = amd_bytealign_S (w[11], w[12], offset); w[22] = amd_bytealign_S (w[10], w[11], offset); w[21] = amd_bytealign_S (w[ 9], w[10], offset); w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); w[18] = amd_bytealign_S (w[ 6], w[ 7], offset); w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); w[11] = amd_bytealign_S ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 
2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = amd_bytealign_S (w[50], w[51], offset); w[62] = amd_bytealign_S (w[49], w[50], offset); w[61] = amd_bytealign_S (w[48], w[49], offset); w[60] = amd_bytealign_S (w[47], w[48], offset); w[59] = amd_bytealign_S (w[46], w[47], offset); w[58] = amd_bytealign_S (w[45], w[46], offset); w[57] = amd_bytealign_S (w[44], w[45], offset); w[56] = amd_bytealign_S (w[43], w[44], offset); w[55] = amd_bytealign_S (w[42], w[43], offset); w[54] = amd_bytealign_S (w[41], w[42], offset); w[53] = amd_bytealign_S (w[40], w[41], offset); w[52] = amd_bytealign_S (w[39], w[40], offset); w[51] = amd_bytealign_S (w[38], w[39], offset); w[50] = amd_bytealign_S (w[37], w[38], offset); w[49] = amd_bytealign_S (w[36], w[37], offset); w[48] = amd_bytealign_S (w[35], w[36], offset); w[47] = amd_bytealign_S (w[34], w[35], offset); w[46] = amd_bytealign_S (w[33], w[34], offset); w[45] = amd_bytealign_S (w[32], w[33], offset); w[44] = amd_bytealign_S (w[31], w[32], offset); w[43] = amd_bytealign_S (w[30], w[31], offset); w[42] = amd_bytealign_S (w[29], w[30], offset); w[41] = amd_bytealign_S (w[28], w[29], offset); w[40] = amd_bytealign_S (w[27], w[28], offset); w[39] = amd_bytealign_S (w[26], w[27], offset); w[38] = amd_bytealign_S (w[25], w[26], offset); w[37] = amd_bytealign_S (w[24], w[25], offset); w[36] = amd_bytealign_S (w[23], w[24], offset); w[35] = amd_bytealign_S (w[22], w[23], offset); w[34] = amd_bytealign_S (w[21], w[22], offset); w[33] = amd_bytealign_S (w[20], w[21], offset); w[32] = amd_bytealign_S (w[19], w[20], offset); w[31] = amd_bytealign_S (w[18], w[19], offset); w[30] = amd_bytealign_S (w[17], w[18], offset); w[29] = amd_bytealign_S (w[16], w[17], offset); w[28] = amd_bytealign_S (w[15], w[16], offset); w[27] = amd_bytealign_S (w[14], w[15], offset); w[26] = amd_bytealign_S (w[13], w[14], offset); w[25] = amd_bytealign_S (w[12], w[13], offset); w[24] = amd_bytealign_S (w[11], w[12], offset); w[23] = amd_bytealign_S (w[10], 
w[11], offset); w[22] = amd_bytealign_S (w[ 9], w[10], offset); w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); w[18] = amd_bytealign_S (w[ 5], w[ 6], offset); w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); w[12] = amd_bytealign_S ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = amd_bytealign_S (w[49], w[50], offset); w[62] = amd_bytealign_S (w[48], w[49], offset); w[61] = amd_bytealign_S (w[47], w[48], offset); w[60] = amd_bytealign_S (w[46], w[47], offset); w[59] = amd_bytealign_S (w[45], w[46], offset); w[58] = amd_bytealign_S (w[44], w[45], offset); w[57] = amd_bytealign_S (w[43], w[44], offset); w[56] = amd_bytealign_S (w[42], w[43], offset); w[55] = amd_bytealign_S (w[41], w[42], offset); w[54] = amd_bytealign_S (w[40], w[41], offset); w[53] = amd_bytealign_S (w[39], w[40], offset); w[52] = amd_bytealign_S (w[38], w[39], offset); w[51] = amd_bytealign_S (w[37], w[38], offset); w[50] = amd_bytealign_S (w[36], w[37], offset); w[49] = amd_bytealign_S (w[35], w[36], offset); w[48] = amd_bytealign_S (w[34], w[35], offset); w[47] = amd_bytealign_S (w[33], w[34], offset); w[46] = amd_bytealign_S (w[32], w[33], offset); w[45] = amd_bytealign_S (w[31], w[32], offset); w[44] = amd_bytealign_S (w[30], w[31], offset); w[43] = amd_bytealign_S (w[29], w[30], offset); w[42] = amd_bytealign_S (w[28], w[29], offset); w[41] = amd_bytealign_S (w[27], w[28], offset); w[40] = amd_bytealign_S (w[26], w[27], offset); w[39] = amd_bytealign_S (w[25], w[26], offset); w[38] = amd_bytealign_S (w[24], w[25], offset); w[37] = amd_bytealign_S (w[23], w[24], offset); w[36] = 
amd_bytealign_S (w[22], w[23], offset); w[35] = amd_bytealign_S (w[21], w[22], offset); w[34] = amd_bytealign_S (w[20], w[21], offset); w[33] = amd_bytealign_S (w[19], w[20], offset); w[32] = amd_bytealign_S (w[18], w[19], offset); w[31] = amd_bytealign_S (w[17], w[18], offset); w[30] = amd_bytealign_S (w[16], w[17], offset); w[29] = amd_bytealign_S (w[15], w[16], offset); w[28] = amd_bytealign_S (w[14], w[15], offset); w[27] = amd_bytealign_S (w[13], w[14], offset); w[26] = amd_bytealign_S (w[12], w[13], offset); w[25] = amd_bytealign_S (w[11], w[12], offset); w[24] = amd_bytealign_S (w[10], w[11], offset); w[23] = amd_bytealign_S (w[ 9], w[10], offset); w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); w[13] = amd_bytealign_S ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 14: w[63] = amd_bytealign_S (w[48], w[49], offset); w[62] = amd_bytealign_S (w[47], w[48], offset); w[61] = amd_bytealign_S (w[46], w[47], offset); w[60] = amd_bytealign_S (w[45], w[46], offset); w[59] = amd_bytealign_S (w[44], w[45], offset); w[58] = amd_bytealign_S (w[43], w[44], offset); w[57] = amd_bytealign_S (w[42], w[43], offset); w[56] = amd_bytealign_S (w[41], w[42], offset); w[55] = amd_bytealign_S (w[40], w[41], offset); w[54] = amd_bytealign_S (w[39], w[40], offset); w[53] = amd_bytealign_S (w[38], w[39], offset); w[52] = amd_bytealign_S (w[37], w[38], offset); w[51] = amd_bytealign_S (w[36], w[37], offset); w[50] = amd_bytealign_S (w[35], w[36], offset); w[49] = amd_bytealign_S 
(w[34], w[35], offset); w[48] = amd_bytealign_S (w[33], w[34], offset); w[47] = amd_bytealign_S (w[32], w[33], offset); w[46] = amd_bytealign_S (w[31], w[32], offset); w[45] = amd_bytealign_S (w[30], w[31], offset); w[44] = amd_bytealign_S (w[29], w[30], offset); w[43] = amd_bytealign_S (w[28], w[29], offset); w[42] = amd_bytealign_S (w[27], w[28], offset); w[41] = amd_bytealign_S (w[26], w[27], offset); w[40] = amd_bytealign_S (w[25], w[26], offset); w[39] = amd_bytealign_S (w[24], w[25], offset); w[38] = amd_bytealign_S (w[23], w[24], offset); w[37] = amd_bytealign_S (w[22], w[23], offset); w[36] = amd_bytealign_S (w[21], w[22], offset); w[35] = amd_bytealign_S (w[20], w[21], offset); w[34] = amd_bytealign_S (w[19], w[20], offset); w[33] = amd_bytealign_S (w[18], w[19], offset); w[32] = amd_bytealign_S (w[17], w[18], offset); w[31] = amd_bytealign_S (w[16], w[17], offset); w[30] = amd_bytealign_S (w[15], w[16], offset); w[29] = amd_bytealign_S (w[14], w[15], offset); w[28] = amd_bytealign_S (w[13], w[14], offset); w[27] = amd_bytealign_S (w[12], w[13], offset); w[26] = amd_bytealign_S (w[11], w[12], offset); w[25] = amd_bytealign_S (w[10], w[11], offset); w[24] = amd_bytealign_S (w[ 9], w[10], offset); w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); w[22] = amd_bytealign_S (w[ 7], w[ 8], offset); w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); w[17] = amd_bytealign_S (w[ 2], w[ 3], offset); w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); w[14] = amd_bytealign_S ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = amd_bytealign_S (w[47], w[48], offset); w[62] = amd_bytealign_S (w[46], w[47], offset); w[61] = amd_bytealign_S (w[45], 
w[46], offset); w[60] = amd_bytealign_S (w[44], w[45], offset); w[59] = amd_bytealign_S (w[43], w[44], offset); w[58] = amd_bytealign_S (w[42], w[43], offset); w[57] = amd_bytealign_S (w[41], w[42], offset); w[56] = amd_bytealign_S (w[40], w[41], offset); w[55] = amd_bytealign_S (w[39], w[40], offset); w[54] = amd_bytealign_S (w[38], w[39], offset); w[53] = amd_bytealign_S (w[37], w[38], offset); w[52] = amd_bytealign_S (w[36], w[37], offset); w[51] = amd_bytealign_S (w[35], w[36], offset); w[50] = amd_bytealign_S (w[34], w[35], offset); w[49] = amd_bytealign_S (w[33], w[34], offset); w[48] = amd_bytealign_S (w[32], w[33], offset); w[47] = amd_bytealign_S (w[31], w[32], offset); w[46] = amd_bytealign_S (w[30], w[31], offset); w[45] = amd_bytealign_S (w[29], w[30], offset); w[44] = amd_bytealign_S (w[28], w[29], offset); w[43] = amd_bytealign_S (w[27], w[28], offset); w[42] = amd_bytealign_S (w[26], w[27], offset); w[41] = amd_bytealign_S (w[25], w[26], offset); w[40] = amd_bytealign_S (w[24], w[25], offset); w[39] = amd_bytealign_S (w[23], w[24], offset); w[38] = amd_bytealign_S (w[22], w[23], offset); w[37] = amd_bytealign_S (w[21], w[22], offset); w[36] = amd_bytealign_S (w[20], w[21], offset); w[35] = amd_bytealign_S (w[19], w[20], offset); w[34] = amd_bytealign_S (w[18], w[19], offset); w[33] = amd_bytealign_S (w[17], w[18], offset); w[32] = amd_bytealign_S (w[16], w[17], offset); w[31] = amd_bytealign_S (w[15], w[16], offset); w[30] = amd_bytealign_S (w[14], w[15], offset); w[29] = amd_bytealign_S (w[13], w[14], offset); w[28] = amd_bytealign_S (w[12], w[13], offset); w[27] = amd_bytealign_S (w[11], w[12], offset); w[26] = amd_bytealign_S (w[10], w[11], offset); w[25] = amd_bytealign_S (w[ 9], w[10], offset); w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); w[19] = 
amd_bytealign_S (w[ 3], w[ 4], offset); w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); w[15] = amd_bytealign_S ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = amd_bytealign_S (w[46], w[47], offset); w[62] = amd_bytealign_S (w[45], w[46], offset); w[61] = amd_bytealign_S (w[44], w[45], offset); w[60] = amd_bytealign_S (w[43], w[44], offset); w[59] = amd_bytealign_S (w[42], w[43], offset); w[58] = amd_bytealign_S (w[41], w[42], offset); w[57] = amd_bytealign_S (w[40], w[41], offset); w[56] = amd_bytealign_S (w[39], w[40], offset); w[55] = amd_bytealign_S (w[38], w[39], offset); w[54] = amd_bytealign_S (w[37], w[38], offset); w[53] = amd_bytealign_S (w[36], w[37], offset); w[52] = amd_bytealign_S (w[35], w[36], offset); w[51] = amd_bytealign_S (w[34], w[35], offset); w[50] = amd_bytealign_S (w[33], w[34], offset); w[49] = amd_bytealign_S (w[32], w[33], offset); w[48] = amd_bytealign_S (w[31], w[32], offset); w[47] = amd_bytealign_S (w[30], w[31], offset); w[46] = amd_bytealign_S (w[29], w[30], offset); w[45] = amd_bytealign_S (w[28], w[29], offset); w[44] = amd_bytealign_S (w[27], w[28], offset); w[43] = amd_bytealign_S (w[26], w[27], offset); w[42] = amd_bytealign_S (w[25], w[26], offset); w[41] = amd_bytealign_S (w[24], w[25], offset); w[40] = amd_bytealign_S (w[23], w[24], offset); w[39] = amd_bytealign_S (w[22], w[23], offset); w[38] = amd_bytealign_S (w[21], w[22], offset); w[37] = amd_bytealign_S (w[20], w[21], offset); w[36] = amd_bytealign_S (w[19], w[20], offset); w[35] = amd_bytealign_S (w[18], w[19], offset); w[34] = amd_bytealign_S (w[17], w[18], offset); w[33] = amd_bytealign_S (w[16], w[17], offset); w[32] = amd_bytealign_S (w[15], w[16], offset); w[31] = amd_bytealign_S (w[14], w[15], offset); w[30] 
= amd_bytealign_S (w[13], w[14], offset); w[29] = amd_bytealign_S (w[12], w[13], offset); w[28] = amd_bytealign_S (w[11], w[12], offset); w[27] = amd_bytealign_S (w[10], w[11], offset); w[26] = amd_bytealign_S (w[ 9], w[10], offset); w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); w[16] = amd_bytealign_S ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = amd_bytealign_S (w[45], w[46], offset); w[62] = amd_bytealign_S (w[44], w[45], offset); w[61] = amd_bytealign_S (w[43], w[44], offset); w[60] = amd_bytealign_S (w[42], w[43], offset); w[59] = amd_bytealign_S (w[41], w[42], offset); w[58] = amd_bytealign_S (w[40], w[41], offset); w[57] = amd_bytealign_S (w[39], w[40], offset); w[56] = amd_bytealign_S (w[38], w[39], offset); w[55] = amd_bytealign_S (w[37], w[38], offset); w[54] = amd_bytealign_S (w[36], w[37], offset); w[53] = amd_bytealign_S (w[35], w[36], offset); w[52] = amd_bytealign_S (w[34], w[35], offset); w[51] = amd_bytealign_S (w[33], w[34], offset); w[50] = amd_bytealign_S (w[32], w[33], offset); w[49] = amd_bytealign_S (w[31], w[32], offset); w[48] = amd_bytealign_S (w[30], w[31], offset); w[47] = amd_bytealign_S (w[29], w[30], offset); w[46] = amd_bytealign_S (w[28], w[29], offset); w[45] = amd_bytealign_S (w[27], w[28], offset); w[44] = amd_bytealign_S (w[26], w[27], offset); w[43] = amd_bytealign_S (w[25], w[26], offset); w[42] = amd_bytealign_S (w[24], w[25], offset); w[41] = amd_bytealign_S (w[23], w[24], 
offset); w[40] = amd_bytealign_S (w[22], w[23], offset); w[39] = amd_bytealign_S (w[21], w[22], offset); w[38] = amd_bytealign_S (w[20], w[21], offset); w[37] = amd_bytealign_S (w[19], w[20], offset); w[36] = amd_bytealign_S (w[18], w[19], offset); w[35] = amd_bytealign_S (w[17], w[18], offset); w[34] = amd_bytealign_S (w[16], w[17], offset); w[33] = amd_bytealign_S (w[15], w[16], offset); w[32] = amd_bytealign_S (w[14], w[15], offset); w[31] = amd_bytealign_S (w[13], w[14], offset); w[30] = amd_bytealign_S (w[12], w[13], offset); w[29] = amd_bytealign_S (w[11], w[12], offset); w[28] = amd_bytealign_S (w[10], w[11], offset); w[27] = amd_bytealign_S (w[ 9], w[10], offset); w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); w[18] = amd_bytealign_S (w[ 0], w[ 1], offset); w[17] = amd_bytealign_S ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = amd_bytealign_S (w[44], w[45], offset); w[62] = amd_bytealign_S (w[43], w[44], offset); w[61] = amd_bytealign_S (w[42], w[43], offset); w[60] = amd_bytealign_S (w[41], w[42], offset); w[59] = amd_bytealign_S (w[40], w[41], offset); w[58] = amd_bytealign_S (w[39], w[40], offset); w[57] = amd_bytealign_S (w[38], w[39], offset); w[56] = amd_bytealign_S (w[37], w[38], offset); w[55] = amd_bytealign_S (w[36], w[37], offset); w[54] = amd_bytealign_S (w[35], w[36], offset); w[53] = amd_bytealign_S (w[34], w[35], offset); w[52] = amd_bytealign_S (w[33], w[34], offset); w[51] = amd_bytealign_S (w[32], w[33], offset); w[50] = 
amd_bytealign_S (w[31], w[32], offset); w[49] = amd_bytealign_S (w[30], w[31], offset); w[48] = amd_bytealign_S (w[29], w[30], offset); w[47] = amd_bytealign_S (w[28], w[29], offset); w[46] = amd_bytealign_S (w[27], w[28], offset); w[45] = amd_bytealign_S (w[26], w[27], offset); w[44] = amd_bytealign_S (w[25], w[26], offset); w[43] = amd_bytealign_S (w[24], w[25], offset); w[42] = amd_bytealign_S (w[23], w[24], offset); w[41] = amd_bytealign_S (w[22], w[23], offset); w[40] = amd_bytealign_S (w[21], w[22], offset); w[39] = amd_bytealign_S (w[20], w[21], offset); w[38] = amd_bytealign_S (w[19], w[20], offset); w[37] = amd_bytealign_S (w[18], w[19], offset); w[36] = amd_bytealign_S (w[17], w[18], offset); w[35] = amd_bytealign_S (w[16], w[17], offset); w[34] = amd_bytealign_S (w[15], w[16], offset); w[33] = amd_bytealign_S (w[14], w[15], offset); w[32] = amd_bytealign_S (w[13], w[14], offset); w[31] = amd_bytealign_S (w[12], w[13], offset); w[30] = amd_bytealign_S (w[11], w[12], offset); w[29] = amd_bytealign_S (w[10], w[11], offset); w[28] = amd_bytealign_S (w[ 9], w[10], offset); w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); w[24] = amd_bytealign_S (w[ 5], w[ 6], offset); w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); w[18] = amd_bytealign_S ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = amd_bytealign_S (w[43], w[44], offset); w[62] = amd_bytealign_S (w[42], w[43], offset); w[61] = amd_bytealign_S (w[41], w[42], offset); w[60] = amd_bytealign_S (w[40], w[41], offset); w[59] = 
amd_bytealign_S (w[39], w[40], offset); w[58] = amd_bytealign_S (w[38], w[39], offset); w[57] = amd_bytealign_S (w[37], w[38], offset); w[56] = amd_bytealign_S (w[36], w[37], offset); w[55] = amd_bytealign_S (w[35], w[36], offset); w[54] = amd_bytealign_S (w[34], w[35], offset); w[53] = amd_bytealign_S (w[33], w[34], offset); w[52] = amd_bytealign_S (w[32], w[33], offset); w[51] = amd_bytealign_S (w[31], w[32], offset); w[50] = amd_bytealign_S (w[30], w[31], offset); w[49] = amd_bytealign_S (w[29], w[30], offset); w[48] = amd_bytealign_S (w[28], w[29], offset); w[47] = amd_bytealign_S (w[27], w[28], offset); w[46] = amd_bytealign_S (w[26], w[27], offset); w[45] = amd_bytealign_S (w[25], w[26], offset); w[44] = amd_bytealign_S (w[24], w[25], offset); w[43] = amd_bytealign_S (w[23], w[24], offset); w[42] = amd_bytealign_S (w[22], w[23], offset); w[41] = amd_bytealign_S (w[21], w[22], offset); w[40] = amd_bytealign_S (w[20], w[21], offset); w[39] = amd_bytealign_S (w[19], w[20], offset); w[38] = amd_bytealign_S (w[18], w[19], offset); w[37] = amd_bytealign_S (w[17], w[18], offset); w[36] = amd_bytealign_S (w[16], w[17], offset); w[35] = amd_bytealign_S (w[15], w[16], offset); w[34] = amd_bytealign_S (w[14], w[15], offset); w[33] = amd_bytealign_S (w[13], w[14], offset); w[32] = amd_bytealign_S (w[12], w[13], offset); w[31] = amd_bytealign_S (w[11], w[12], offset); w[30] = amd_bytealign_S (w[10], w[11], offset); w[29] = amd_bytealign_S (w[ 9], w[10], offset); w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); w[19] = amd_bytealign_S ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 
0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = amd_bytealign_S (w[42], w[43], offset); w[62] = amd_bytealign_S (w[41], w[42], offset); w[61] = amd_bytealign_S (w[40], w[41], offset); w[60] = amd_bytealign_S (w[39], w[40], offset); w[59] = amd_bytealign_S (w[38], w[39], offset); w[58] = amd_bytealign_S (w[37], w[38], offset); w[57] = amd_bytealign_S (w[36], w[37], offset); w[56] = amd_bytealign_S (w[35], w[36], offset); w[55] = amd_bytealign_S (w[34], w[35], offset); w[54] = amd_bytealign_S (w[33], w[34], offset); w[53] = amd_bytealign_S (w[32], w[33], offset); w[52] = amd_bytealign_S (w[31], w[32], offset); w[51] = amd_bytealign_S (w[30], w[31], offset); w[50] = amd_bytealign_S (w[29], w[30], offset); w[49] = amd_bytealign_S (w[28], w[29], offset); w[48] = amd_bytealign_S (w[27], w[28], offset); w[47] = amd_bytealign_S (w[26], w[27], offset); w[46] = amd_bytealign_S (w[25], w[26], offset); w[45] = amd_bytealign_S (w[24], w[25], offset); w[44] = amd_bytealign_S (w[23], w[24], offset); w[43] = amd_bytealign_S (w[22], w[23], offset); w[42] = amd_bytealign_S (w[21], w[22], offset); w[41] = amd_bytealign_S (w[20], w[21], offset); w[40] = amd_bytealign_S (w[19], w[20], offset); w[39] = amd_bytealign_S (w[18], w[19], offset); w[38] = amd_bytealign_S (w[17], w[18], offset); w[37] = amd_bytealign_S (w[16], w[17], offset); w[36] = amd_bytealign_S (w[15], w[16], offset); w[35] = amd_bytealign_S (w[14], w[15], offset); w[34] = amd_bytealign_S (w[13], w[14], offset); w[33] = amd_bytealign_S (w[12], w[13], offset); w[32] = amd_bytealign_S (w[11], w[12], offset); w[31] = amd_bytealign_S (w[10], w[11], offset); w[30] = amd_bytealign_S (w[ 9], w[10], offset); w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); w[28] = amd_bytealign_S (w[ 7], w[ 8], offset); w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); w[26] = amd_bytealign_S (w[ 5], w[ 6], 
offset); w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); w[20] = amd_bytealign_S ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = amd_bytealign_S (w[41], w[42], offset); w[62] = amd_bytealign_S (w[40], w[41], offset); w[61] = amd_bytealign_S (w[39], w[40], offset); w[60] = amd_bytealign_S (w[38], w[39], offset); w[59] = amd_bytealign_S (w[37], w[38], offset); w[58] = amd_bytealign_S (w[36], w[37], offset); w[57] = amd_bytealign_S (w[35], w[36], offset); w[56] = amd_bytealign_S (w[34], w[35], offset); w[55] = amd_bytealign_S (w[33], w[34], offset); w[54] = amd_bytealign_S (w[32], w[33], offset); w[53] = amd_bytealign_S (w[31], w[32], offset); w[52] = amd_bytealign_S (w[30], w[31], offset); w[51] = amd_bytealign_S (w[29], w[30], offset); w[50] = amd_bytealign_S (w[28], w[29], offset); w[49] = amd_bytealign_S (w[27], w[28], offset); w[48] = amd_bytealign_S (w[26], w[27], offset); w[47] = amd_bytealign_S (w[25], w[26], offset); w[46] = amd_bytealign_S (w[24], w[25], offset); w[45] = amd_bytealign_S (w[23], w[24], offset); w[44] = amd_bytealign_S (w[22], w[23], offset); w[43] = amd_bytealign_S (w[21], w[22], offset); w[42] = amd_bytealign_S (w[20], w[21], offset); w[41] = amd_bytealign_S (w[19], w[20], offset); w[40] = amd_bytealign_S (w[18], w[19], offset); w[39] = amd_bytealign_S (w[17], w[18], offset); w[38] = amd_bytealign_S (w[16], w[17], offset); w[37] = amd_bytealign_S (w[15], w[16], offset); w[36] = amd_bytealign_S (w[14], w[15], offset); w[35] = amd_bytealign_S (w[13], w[14], offset); w[34] = amd_bytealign_S (w[12], w[13], offset); w[33] = amd_bytealign_S 
(w[11], w[12], offset); w[32] = amd_bytealign_S (w[10], w[11], offset); w[31] = amd_bytealign_S (w[ 9], w[10], offset); w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); w[21] = amd_bytealign_S ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = amd_bytealign_S (w[40], w[41], offset); w[62] = amd_bytealign_S (w[39], w[40], offset); w[61] = amd_bytealign_S (w[38], w[39], offset); w[60] = amd_bytealign_S (w[37], w[38], offset); w[59] = amd_bytealign_S (w[36], w[37], offset); w[58] = amd_bytealign_S (w[35], w[36], offset); w[57] = amd_bytealign_S (w[34], w[35], offset); w[56] = amd_bytealign_S (w[33], w[34], offset); w[55] = amd_bytealign_S (w[32], w[33], offset); w[54] = amd_bytealign_S (w[31], w[32], offset); w[53] = amd_bytealign_S (w[30], w[31], offset); w[52] = amd_bytealign_S (w[29], w[30], offset); w[51] = amd_bytealign_S (w[28], w[29], offset); w[50] = amd_bytealign_S (w[27], w[28], offset); w[49] = amd_bytealign_S (w[26], w[27], offset); w[48] = amd_bytealign_S (w[25], w[26], offset); w[47] = amd_bytealign_S (w[24], w[25], offset); w[46] = amd_bytealign_S (w[23], w[24], offset); w[45] = amd_bytealign_S (w[22], w[23], offset); w[44] = amd_bytealign_S (w[21], w[22], offset); w[43] = amd_bytealign_S (w[20], w[21], offset); w[42] = amd_bytealign_S (w[19], w[20], offset); w[41] = amd_bytealign_S (w[18], w[19], offset); w[40] = amd_bytealign_S (w[17], w[18], offset); 
w[39] = amd_bytealign_S (w[16], w[17], offset); w[38] = amd_bytealign_S (w[15], w[16], offset); w[37] = amd_bytealign_S (w[14], w[15], offset); w[36] = amd_bytealign_S (w[13], w[14], offset); w[35] = amd_bytealign_S (w[12], w[13], offset); w[34] = amd_bytealign_S (w[11], w[12], offset); w[33] = amd_bytealign_S (w[10], w[11], offset); w[32] = amd_bytealign_S (w[ 9], w[10], offset); w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); w[22] = amd_bytealign_S ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = amd_bytealign_S (w[39], w[40], offset); w[62] = amd_bytealign_S (w[38], w[39], offset); w[61] = amd_bytealign_S (w[37], w[38], offset); w[60] = amd_bytealign_S (w[36], w[37], offset); w[59] = amd_bytealign_S (w[35], w[36], offset); w[58] = amd_bytealign_S (w[34], w[35], offset); w[57] = amd_bytealign_S (w[33], w[34], offset); w[56] = amd_bytealign_S (w[32], w[33], offset); w[55] = amd_bytealign_S (w[31], w[32], offset); w[54] = amd_bytealign_S (w[30], w[31], offset); w[53] = amd_bytealign_S (w[29], w[30], offset); w[52] = amd_bytealign_S (w[28], w[29], offset); w[51] = amd_bytealign_S (w[27], w[28], offset); w[50] = amd_bytealign_S (w[26], w[27], offset); w[49] = amd_bytealign_S (w[25], w[26], offset); w[48] = amd_bytealign_S (w[24], w[25], offset); w[47] = amd_bytealign_S (w[23], w[24], offset); w[46] = amd_bytealign_S (w[22], w[23], offset); w[45] = 
amd_bytealign_S (w[21], w[22], offset); w[44] = amd_bytealign_S (w[20], w[21], offset); w[43] = amd_bytealign_S (w[19], w[20], offset); w[42] = amd_bytealign_S (w[18], w[19], offset); w[41] = amd_bytealign_S (w[17], w[18], offset); w[40] = amd_bytealign_S (w[16], w[17], offset); w[39] = amd_bytealign_S (w[15], w[16], offset); w[38] = amd_bytealign_S (w[14], w[15], offset); w[37] = amd_bytealign_S (w[13], w[14], offset); w[36] = amd_bytealign_S (w[12], w[13], offset); w[35] = amd_bytealign_S (w[11], w[12], offset); w[34] = amd_bytealign_S (w[10], w[11], offset); w[33] = amd_bytealign_S (w[ 9], w[10], offset); w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); w[23] = amd_bytealign_S ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = amd_bytealign_S (w[38], w[39], offset); w[62] = amd_bytealign_S (w[37], w[38], offset); w[61] = amd_bytealign_S (w[36], w[37], offset); w[60] = amd_bytealign_S (w[35], w[36], offset); w[59] = amd_bytealign_S (w[34], w[35], offset); w[58] = amd_bytealign_S (w[33], w[34], offset); w[57] = amd_bytealign_S (w[32], w[33], offset); w[56] = amd_bytealign_S (w[31], w[32], offset); w[55] = amd_bytealign_S (w[30], w[31], offset); w[54] = amd_bytealign_S (w[29], w[30], offset); w[53] = amd_bytealign_S (w[28], w[29], offset); w[52] = amd_bytealign_S (w[27], w[28], offset); w[51] = amd_bytealign_S (w[26], w[27], offset); w[50] = 
amd_bytealign_S (w[25], w[26], offset); w[49] = amd_bytealign_S (w[24], w[25], offset); w[48] = amd_bytealign_S (w[23], w[24], offset); w[47] = amd_bytealign_S (w[22], w[23], offset); w[46] = amd_bytealign_S (w[21], w[22], offset); w[45] = amd_bytealign_S (w[20], w[21], offset); w[44] = amd_bytealign_S (w[19], w[20], offset); w[43] = amd_bytealign_S (w[18], w[19], offset); w[42] = amd_bytealign_S (w[17], w[18], offset); w[41] = amd_bytealign_S (w[16], w[17], offset); w[40] = amd_bytealign_S (w[15], w[16], offset); w[39] = amd_bytealign_S (w[14], w[15], offset); w[38] = amd_bytealign_S (w[13], w[14], offset); w[37] = amd_bytealign_S (w[12], w[13], offset); w[36] = amd_bytealign_S (w[11], w[12], offset); w[35] = amd_bytealign_S (w[10], w[11], offset); w[34] = amd_bytealign_S (w[ 9], w[10], offset); w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); w[29] = amd_bytealign_S (w[ 4], w[ 5], offset); w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); w[25] = amd_bytealign_S (w[ 0], w[ 1], offset); w[24] = amd_bytealign_S ( 0, w[ 0], offset); w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = amd_bytealign_S (w[37], w[38], offset); w[62] = amd_bytealign_S (w[36], w[37], offset); w[61] = amd_bytealign_S (w[35], w[36], offset); w[60] = amd_bytealign_S (w[34], w[35], offset); w[59] = amd_bytealign_S (w[33], w[34], offset); w[58] = amd_bytealign_S (w[32], w[33], offset); w[57] = amd_bytealign_S (w[31], w[32], offset); w[56] = amd_bytealign_S (w[30], w[31], offset); w[55] = amd_bytealign_S (w[29], w[30], offset); 
w[54] = amd_bytealign_S (w[28], w[29], offset); w[53] = amd_bytealign_S (w[27], w[28], offset); w[52] = amd_bytealign_S (w[26], w[27], offset); w[51] = amd_bytealign_S (w[25], w[26], offset); w[50] = amd_bytealign_S (w[24], w[25], offset); w[49] = amd_bytealign_S (w[23], w[24], offset); w[48] = amd_bytealign_S (w[22], w[23], offset); w[47] = amd_bytealign_S (w[21], w[22], offset); w[46] = amd_bytealign_S (w[20], w[21], offset); w[45] = amd_bytealign_S (w[19], w[20], offset); w[44] = amd_bytealign_S (w[18], w[19], offset); w[43] = amd_bytealign_S (w[17], w[18], offset); w[42] = amd_bytealign_S (w[16], w[17], offset); w[41] = amd_bytealign_S (w[15], w[16], offset); w[40] = amd_bytealign_S (w[14], w[15], offset); w[39] = amd_bytealign_S (w[13], w[14], offset); w[38] = amd_bytealign_S (w[12], w[13], offset); w[37] = amd_bytealign_S (w[11], w[12], offset); w[36] = amd_bytealign_S (w[10], w[11], offset); w[35] = amd_bytealign_S (w[ 9], w[10], offset); w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); w[31] = amd_bytealign_S (w[ 5], w[ 6], offset); w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); w[25] = amd_bytealign_S ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = amd_bytealign_S (w[36], w[37], offset); w[62] = amd_bytealign_S (w[35], w[36], offset); w[61] = amd_bytealign_S (w[34], w[35], offset); w[60] = amd_bytealign_S (w[33], w[34], offset); w[59] = amd_bytealign_S (w[32], w[33], offset); w[58] = amd_bytealign_S (w[31], 
w[32], offset); w[57] = amd_bytealign_S (w[30], w[31], offset); w[56] = amd_bytealign_S (w[29], w[30], offset); w[55] = amd_bytealign_S (w[28], w[29], offset); w[54] = amd_bytealign_S (w[27], w[28], offset); w[53] = amd_bytealign_S (w[26], w[27], offset); w[52] = amd_bytealign_S (w[25], w[26], offset); w[51] = amd_bytealign_S (w[24], w[25], offset); w[50] = amd_bytealign_S (w[23], w[24], offset); w[49] = amd_bytealign_S (w[22], w[23], offset); w[48] = amd_bytealign_S (w[21], w[22], offset); w[47] = amd_bytealign_S (w[20], w[21], offset); w[46] = amd_bytealign_S (w[19], w[20], offset); w[45] = amd_bytealign_S (w[18], w[19], offset); w[44] = amd_bytealign_S (w[17], w[18], offset); w[43] = amd_bytealign_S (w[16], w[17], offset); w[42] = amd_bytealign_S (w[15], w[16], offset); w[41] = amd_bytealign_S (w[14], w[15], offset); w[40] = amd_bytealign_S (w[13], w[14], offset); w[39] = amd_bytealign_S (w[12], w[13], offset); w[38] = amd_bytealign_S (w[11], w[12], offset); w[37] = amd_bytealign_S (w[10], w[11], offset); w[36] = amd_bytealign_S (w[ 9], w[10], offset); w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); w[31] = amd_bytealign_S (w[ 4], w[ 5], offset); w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); w[26] = amd_bytealign_S ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = amd_bytealign_S (w[35], w[36], offset); w[62] = amd_bytealign_S (w[34], w[35], offset); w[61] = amd_bytealign_S (w[33], w[34], offset); 
w[60] = amd_bytealign_S (w[32], w[33], offset); w[59] = amd_bytealign_S (w[31], w[32], offset); w[58] = amd_bytealign_S (w[30], w[31], offset); w[57] = amd_bytealign_S (w[29], w[30], offset); w[56] = amd_bytealign_S (w[28], w[29], offset); w[55] = amd_bytealign_S (w[27], w[28], offset); w[54] = amd_bytealign_S (w[26], w[27], offset); w[53] = amd_bytealign_S (w[25], w[26], offset); w[52] = amd_bytealign_S (w[24], w[25], offset); w[51] = amd_bytealign_S (w[23], w[24], offset); w[50] = amd_bytealign_S (w[22], w[23], offset); w[49] = amd_bytealign_S (w[21], w[22], offset); w[48] = amd_bytealign_S (w[20], w[21], offset); w[47] = amd_bytealign_S (w[19], w[20], offset); w[46] = amd_bytealign_S (w[18], w[19], offset); w[45] = amd_bytealign_S (w[17], w[18], offset); w[44] = amd_bytealign_S (w[16], w[17], offset); w[43] = amd_bytealign_S (w[15], w[16], offset); w[42] = amd_bytealign_S (w[14], w[15], offset); w[41] = amd_bytealign_S (w[13], w[14], offset); w[40] = amd_bytealign_S (w[12], w[13], offset); w[39] = amd_bytealign_S (w[11], w[12], offset); w[38] = amd_bytealign_S (w[10], w[11], offset); w[37] = amd_bytealign_S (w[ 9], w[10], offset); w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); w[30] = amd_bytealign_S (w[ 2], w[ 3], offset); w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); w[27] = amd_bytealign_S ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = amd_bytealign_S (w[34], w[35], offset); w[62] = 
amd_bytealign_S (w[33], w[34], offset); w[61] = amd_bytealign_S (w[32], w[33], offset); w[60] = amd_bytealign_S (w[31], w[32], offset); w[59] = amd_bytealign_S (w[30], w[31], offset); w[58] = amd_bytealign_S (w[29], w[30], offset); w[57] = amd_bytealign_S (w[28], w[29], offset); w[56] = amd_bytealign_S (w[27], w[28], offset); w[55] = amd_bytealign_S (w[26], w[27], offset); w[54] = amd_bytealign_S (w[25], w[26], offset); w[53] = amd_bytealign_S (w[24], w[25], offset); w[52] = amd_bytealign_S (w[23], w[24], offset); w[51] = amd_bytealign_S (w[22], w[23], offset); w[50] = amd_bytealign_S (w[21], w[22], offset); w[49] = amd_bytealign_S (w[20], w[21], offset); w[48] = amd_bytealign_S (w[19], w[20], offset); w[47] = amd_bytealign_S (w[18], w[19], offset); w[46] = amd_bytealign_S (w[17], w[18], offset); w[45] = amd_bytealign_S (w[16], w[17], offset); w[44] = amd_bytealign_S (w[15], w[16], offset); w[43] = amd_bytealign_S (w[14], w[15], offset); w[42] = amd_bytealign_S (w[13], w[14], offset); w[41] = amd_bytealign_S (w[12], w[13], offset); w[40] = amd_bytealign_S (w[11], w[12], offset); w[39] = amd_bytealign_S (w[10], w[11], offset); w[38] = amd_bytealign_S (w[ 9], w[10], offset); w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); w[35] = amd_bytealign_S (w[ 6], w[ 7], offset); w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); w[28] = amd_bytealign_S ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 29: w[63] 
= amd_bytealign_S (w[33], w[34], offset); w[62] = amd_bytealign_S (w[32], w[33], offset); w[61] = amd_bytealign_S (w[31], w[32], offset); w[60] = amd_bytealign_S (w[30], w[31], offset); w[59] = amd_bytealign_S (w[29], w[30], offset); w[58] = amd_bytealign_S (w[28], w[29], offset); w[57] = amd_bytealign_S (w[27], w[28], offset); w[56] = amd_bytealign_S (w[26], w[27], offset); w[55] = amd_bytealign_S (w[25], w[26], offset); w[54] = amd_bytealign_S (w[24], w[25], offset); w[53] = amd_bytealign_S (w[23], w[24], offset); w[52] = amd_bytealign_S (w[22], w[23], offset); w[51] = amd_bytealign_S (w[21], w[22], offset); w[50] = amd_bytealign_S (w[20], w[21], offset); w[49] = amd_bytealign_S (w[19], w[20], offset); w[48] = amd_bytealign_S (w[18], w[19], offset); w[47] = amd_bytealign_S (w[17], w[18], offset); w[46] = amd_bytealign_S (w[16], w[17], offset); w[45] = amd_bytealign_S (w[15], w[16], offset); w[44] = amd_bytealign_S (w[14], w[15], offset); w[43] = amd_bytealign_S (w[13], w[14], offset); w[42] = amd_bytealign_S (w[12], w[13], offset); w[41] = amd_bytealign_S (w[11], w[12], offset); w[40] = amd_bytealign_S (w[10], w[11], offset); w[39] = amd_bytealign_S (w[ 9], w[10], offset); w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); w[36] = amd_bytealign_S (w[ 6], w[ 7], offset); w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); w[29] = amd_bytealign_S ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; 
case 30: w[63] = amd_bytealign_S (w[32], w[33], offset); w[62] = amd_bytealign_S (w[31], w[32], offset); w[61] = amd_bytealign_S (w[30], w[31], offset); w[60] = amd_bytealign_S (w[29], w[30], offset); w[59] = amd_bytealign_S (w[28], w[29], offset); w[58] = amd_bytealign_S (w[27], w[28], offset); w[57] = amd_bytealign_S (w[26], w[27], offset); w[56] = amd_bytealign_S (w[25], w[26], offset); w[55] = amd_bytealign_S (w[24], w[25], offset); w[54] = amd_bytealign_S (w[23], w[24], offset); w[53] = amd_bytealign_S (w[22], w[23], offset); w[52] = amd_bytealign_S (w[21], w[22], offset); w[51] = amd_bytealign_S (w[20], w[21], offset); w[50] = amd_bytealign_S (w[19], w[20], offset); w[49] = amd_bytealign_S (w[18], w[19], offset); w[48] = amd_bytealign_S (w[17], w[18], offset); w[47] = amd_bytealign_S (w[16], w[17], offset); w[46] = amd_bytealign_S (w[15], w[16], offset); w[45] = amd_bytealign_S (w[14], w[15], offset); w[44] = amd_bytealign_S (w[13], w[14], offset); w[43] = amd_bytealign_S (w[12], w[13], offset); w[42] = amd_bytealign_S (w[11], w[12], offset); w[41] = amd_bytealign_S (w[10], w[11], offset); w[40] = amd_bytealign_S (w[ 9], w[10], offset); w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); w[37] = amd_bytealign_S (w[ 6], w[ 7], offset); w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); w[30] = amd_bytealign_S ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 31: w[63] = 
amd_bytealign_S (w[31], w[32], offset); w[62] = amd_bytealign_S (w[30], w[31], offset); w[61] = amd_bytealign_S (w[29], w[30], offset); w[60] = amd_bytealign_S (w[28], w[29], offset); w[59] = amd_bytealign_S (w[27], w[28], offset); w[58] = amd_bytealign_S (w[26], w[27], offset); w[57] = amd_bytealign_S (w[25], w[26], offset); w[56] = amd_bytealign_S (w[24], w[25], offset); w[55] = amd_bytealign_S (w[23], w[24], offset); w[54] = amd_bytealign_S (w[22], w[23], offset); w[53] = amd_bytealign_S (w[21], w[22], offset); w[52] = amd_bytealign_S (w[20], w[21], offset); w[51] = amd_bytealign_S (w[19], w[20], offset); w[50] = amd_bytealign_S (w[18], w[19], offset); w[49] = amd_bytealign_S (w[17], w[18], offset); w[48] = amd_bytealign_S (w[16], w[17], offset); w[47] = amd_bytealign_S (w[15], w[16], offset); w[46] = amd_bytealign_S (w[14], w[15], offset); w[45] = amd_bytealign_S (w[13], w[14], offset); w[44] = amd_bytealign_S (w[12], w[13], offset); w[43] = amd_bytealign_S (w[11], w[12], offset); w[42] = amd_bytealign_S (w[10], w[11], offset); w[41] = amd_bytealign_S (w[ 9], w[10], offset); w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); w[36] = amd_bytealign_S (w[ 4], w[ 5], offset); w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); w[31] = amd_bytealign_S ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 32: w[63] = amd_bytealign_S (w[30], w[31], offset); w[62] = 
amd_bytealign_S (w[29], w[30], offset); w[61] = amd_bytealign_S (w[28], w[29], offset); w[60] = amd_bytealign_S (w[27], w[28], offset); w[59] = amd_bytealign_S (w[26], w[27], offset); w[58] = amd_bytealign_S (w[25], w[26], offset); w[57] = amd_bytealign_S (w[24], w[25], offset); w[56] = amd_bytealign_S (w[23], w[24], offset); w[55] = amd_bytealign_S (w[22], w[23], offset); w[54] = amd_bytealign_S (w[21], w[22], offset); w[53] = amd_bytealign_S (w[20], w[21], offset); w[52] = amd_bytealign_S (w[19], w[20], offset); w[51] = amd_bytealign_S (w[18], w[19], offset); w[50] = amd_bytealign_S (w[17], w[18], offset); w[49] = amd_bytealign_S (w[16], w[17], offset); w[48] = amd_bytealign_S (w[15], w[16], offset); w[47] = amd_bytealign_S (w[14], w[15], offset); w[46] = amd_bytealign_S (w[13], w[14], offset); w[45] = amd_bytealign_S (w[12], w[13], offset); w[44] = amd_bytealign_S (w[11], w[12], offset); w[43] = amd_bytealign_S (w[10], w[11], offset); w[42] = amd_bytealign_S (w[ 9], w[10], offset); w[41] = amd_bytealign_S (w[ 8], w[ 9], offset); w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); w[35] = amd_bytealign_S (w[ 2], w[ 3], offset); w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); w[32] = amd_bytealign_S ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = amd_bytealign_S (w[29], w[30], offset); w[62] = amd_bytealign_S (w[28], w[29], offset); w[61] = amd_bytealign_S (w[27], w[28], offset); w[60] = 
amd_bytealign_S (w[26], w[27], offset); w[59] = amd_bytealign_S (w[25], w[26], offset); w[58] = amd_bytealign_S (w[24], w[25], offset); w[57] = amd_bytealign_S (w[23], w[24], offset); w[56] = amd_bytealign_S (w[22], w[23], offset); w[55] = amd_bytealign_S (w[21], w[22], offset); w[54] = amd_bytealign_S (w[20], w[21], offset); w[53] = amd_bytealign_S (w[19], w[20], offset); w[52] = amd_bytealign_S (w[18], w[19], offset); w[51] = amd_bytealign_S (w[17], w[18], offset); w[50] = amd_bytealign_S (w[16], w[17], offset); w[49] = amd_bytealign_S (w[15], w[16], offset); w[48] = amd_bytealign_S (w[14], w[15], offset); w[47] = amd_bytealign_S (w[13], w[14], offset); w[46] = amd_bytealign_S (w[12], w[13], offset); w[45] = amd_bytealign_S (w[11], w[12], offset); w[44] = amd_bytealign_S (w[10], w[11], offset); w[43] = amd_bytealign_S (w[ 9], w[10], offset); w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); w[33] = amd_bytealign_S ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = amd_bytealign_S (w[28], w[29], offset); w[62] = amd_bytealign_S (w[27], w[28], offset); w[61] = amd_bytealign_S (w[26], w[27], offset); w[60] = amd_bytealign_S (w[25], w[26], offset); w[59] = amd_bytealign_S (w[24], w[25], offset); w[58] = amd_bytealign_S (w[23], w[24], offset); 
w[57] = amd_bytealign_S (w[22], w[23], offset); w[56] = amd_bytealign_S (w[21], w[22], offset); w[55] = amd_bytealign_S (w[20], w[21], offset); w[54] = amd_bytealign_S (w[19], w[20], offset); w[53] = amd_bytealign_S (w[18], w[19], offset); w[52] = amd_bytealign_S (w[17], w[18], offset); w[51] = amd_bytealign_S (w[16], w[17], offset); w[50] = amd_bytealign_S (w[15], w[16], offset); w[49] = amd_bytealign_S (w[14], w[15], offset); w[48] = amd_bytealign_S (w[13], w[14], offset); w[47] = amd_bytealign_S (w[12], w[13], offset); w[46] = amd_bytealign_S (w[11], w[12], offset); w[45] = amd_bytealign_S (w[10], w[11], offset); w[44] = amd_bytealign_S (w[ 9], w[10], offset); w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); w[34] = amd_bytealign_S ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = amd_bytealign_S (w[27], w[28], offset); w[62] = amd_bytealign_S (w[26], w[27], offset); w[61] = amd_bytealign_S (w[25], w[26], offset); w[60] = amd_bytealign_S (w[24], w[25], offset); w[59] = amd_bytealign_S (w[23], w[24], offset); w[58] = amd_bytealign_S (w[22], w[23], offset); w[57] = amd_bytealign_S (w[21], w[22], offset); w[56] = amd_bytealign_S (w[20], w[21], offset); w[55] = amd_bytealign_S (w[19], w[20], offset); w[54] = amd_bytealign_S 
(w[18], w[19], offset); w[53] = amd_bytealign_S (w[17], w[18], offset); w[52] = amd_bytealign_S (w[16], w[17], offset); w[51] = amd_bytealign_S (w[15], w[16], offset); w[50] = amd_bytealign_S (w[14], w[15], offset); w[49] = amd_bytealign_S (w[13], w[14], offset); w[48] = amd_bytealign_S (w[12], w[13], offset); w[47] = amd_bytealign_S (w[11], w[12], offset); w[46] = amd_bytealign_S (w[10], w[11], offset); w[45] = amd_bytealign_S (w[ 9], w[10], offset); w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); w[42] = amd_bytealign_S (w[ 6], w[ 7], offset); w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); w[35] = amd_bytealign_S ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = amd_bytealign_S (w[26], w[27], offset); w[62] = amd_bytealign_S (w[25], w[26], offset); w[61] = amd_bytealign_S (w[24], w[25], offset); w[60] = amd_bytealign_S (w[23], w[24], offset); w[59] = amd_bytealign_S (w[22], w[23], offset); w[58] = amd_bytealign_S (w[21], w[22], offset); w[57] = amd_bytealign_S (w[20], w[21], offset); w[56] = amd_bytealign_S (w[19], w[20], offset); w[55] = amd_bytealign_S (w[18], w[19], offset); w[54] = amd_bytealign_S (w[17], w[18], offset); w[53] = amd_bytealign_S (w[16], w[17], offset); w[52] = amd_bytealign_S (w[15], w[16], offset); w[51] = amd_bytealign_S (w[14], w[15], offset); w[50] = amd_bytealign_S (w[13], w[14], 
offset); w[49] = amd_bytealign_S (w[12], w[13], offset); w[48] = amd_bytealign_S (w[11], w[12], offset); w[47] = amd_bytealign_S (w[10], w[11], offset); w[46] = amd_bytealign_S (w[ 9], w[10], offset); w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); w[38] = amd_bytealign_S (w[ 1], w[ 2], offset); w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); w[36] = amd_bytealign_S ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = amd_bytealign_S (w[25], w[26], offset); w[62] = amd_bytealign_S (w[24], w[25], offset); w[61] = amd_bytealign_S (w[23], w[24], offset); w[60] = amd_bytealign_S (w[22], w[23], offset); w[59] = amd_bytealign_S (w[21], w[22], offset); w[58] = amd_bytealign_S (w[20], w[21], offset); w[57] = amd_bytealign_S (w[19], w[20], offset); w[56] = amd_bytealign_S (w[18], w[19], offset); w[55] = amd_bytealign_S (w[17], w[18], offset); w[54] = amd_bytealign_S (w[16], w[17], offset); w[53] = amd_bytealign_S (w[15], w[16], offset); w[52] = amd_bytealign_S (w[14], w[15], offset); w[51] = amd_bytealign_S (w[13], w[14], offset); w[50] = amd_bytealign_S (w[12], w[13], offset); w[49] = amd_bytealign_S (w[11], w[12], offset); w[48] = amd_bytealign_S (w[10], w[11], offset); w[47] = amd_bytealign_S (w[ 9], w[10], offset); w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); w[45] = amd_bytealign_S (w[ 7], w[ 8], 
offset); w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); w[37] = amd_bytealign_S ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = amd_bytealign_S (w[24], w[25], offset); w[62] = amd_bytealign_S (w[23], w[24], offset); w[61] = amd_bytealign_S (w[22], w[23], offset); w[60] = amd_bytealign_S (w[21], w[22], offset); w[59] = amd_bytealign_S (w[20], w[21], offset); w[58] = amd_bytealign_S (w[19], w[20], offset); w[57] = amd_bytealign_S (w[18], w[19], offset); w[56] = amd_bytealign_S (w[17], w[18], offset); w[55] = amd_bytealign_S (w[16], w[17], offset); w[54] = amd_bytealign_S (w[15], w[16], offset); w[53] = amd_bytealign_S (w[14], w[15], offset); w[52] = amd_bytealign_S (w[13], w[14], offset); w[51] = amd_bytealign_S (w[12], w[13], offset); w[50] = amd_bytealign_S (w[11], w[12], offset); w[49] = amd_bytealign_S (w[10], w[11], offset); w[48] = amd_bytealign_S (w[ 9], w[10], offset); w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); w[39] = amd_bytealign_S (w[ 0], w[ 
1], offset); w[38] = amd_bytealign_S ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = amd_bytealign_S (w[23], w[24], offset); w[62] = amd_bytealign_S (w[22], w[23], offset); w[61] = amd_bytealign_S (w[21], w[22], offset); w[60] = amd_bytealign_S (w[20], w[21], offset); w[59] = amd_bytealign_S (w[19], w[20], offset); w[58] = amd_bytealign_S (w[18], w[19], offset); w[57] = amd_bytealign_S (w[17], w[18], offset); w[56] = amd_bytealign_S (w[16], w[17], offset); w[55] = amd_bytealign_S (w[15], w[16], offset); w[54] = amd_bytealign_S (w[14], w[15], offset); w[53] = amd_bytealign_S (w[13], w[14], offset); w[52] = amd_bytealign_S (w[12], w[13], offset); w[51] = amd_bytealign_S (w[11], w[12], offset); w[50] = amd_bytealign_S (w[10], w[11], offset); w[49] = amd_bytealign_S (w[ 9], w[10], offset); w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); w[47] = amd_bytealign_S (w[ 7], w[ 8], offset); w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); w[39] = amd_bytealign_S ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; 
w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = amd_bytealign_S (w[22], w[23], offset); w[62] = amd_bytealign_S (w[21], w[22], offset); w[61] = amd_bytealign_S (w[20], w[21], offset); w[60] = amd_bytealign_S (w[19], w[20], offset); w[59] = amd_bytealign_S (w[18], w[19], offset); w[58] = amd_bytealign_S (w[17], w[18], offset); w[57] = amd_bytealign_S (w[16], w[17], offset); w[56] = amd_bytealign_S (w[15], w[16], offset); w[55] = amd_bytealign_S (w[14], w[15], offset); w[54] = amd_bytealign_S (w[13], w[14], offset); w[53] = amd_bytealign_S (w[12], w[13], offset); w[52] = amd_bytealign_S (w[11], w[12], offset); w[51] = amd_bytealign_S (w[10], w[11], offset); w[50] = amd_bytealign_S (w[ 9], w[10], offset); w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); w[48] = amd_bytealign_S (w[ 7], w[ 8], offset); w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); w[40] = amd_bytealign_S ( 0, w[ 0], offset); w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = amd_bytealign_S (w[21], w[22], offset); w[62] = amd_bytealign_S (w[20], w[21], offset); w[61] = amd_bytealign_S (w[19], w[20], offset); w[60] = amd_bytealign_S (w[18], w[19], offset); w[59] = amd_bytealign_S (w[17], w[18], offset); w[58] = 
amd_bytealign_S (w[16], w[17], offset); w[57] = amd_bytealign_S (w[15], w[16], offset); w[56] = amd_bytealign_S (w[14], w[15], offset); w[55] = amd_bytealign_S (w[13], w[14], offset); w[54] = amd_bytealign_S (w[12], w[13], offset); w[53] = amd_bytealign_S (w[11], w[12], offset); w[52] = amd_bytealign_S (w[10], w[11], offset); w[51] = amd_bytealign_S (w[ 9], w[10], offset); w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); w[43] = amd_bytealign_S (w[ 1], w[ 2], offset); w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); w[41] = amd_bytealign_S ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 42: w[63] = amd_bytealign_S (w[20], w[21], offset); w[62] = amd_bytealign_S (w[19], w[20], offset); w[61] = amd_bytealign_S (w[18], w[19], offset); w[60] = amd_bytealign_S (w[17], w[18], offset); w[59] = amd_bytealign_S (w[16], w[17], offset); w[58] = amd_bytealign_S (w[15], w[16], offset); w[57] = amd_bytealign_S (w[14], w[15], offset); w[56] = amd_bytealign_S (w[13], w[14], offset); w[55] = amd_bytealign_S (w[12], w[13], offset); w[54] = amd_bytealign_S (w[11], w[12], offset); w[53] = amd_bytealign_S (w[10], w[11], offset); w[52] = amd_bytealign_S (w[ 9], w[10], offset); w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); w[49] = 
amd_bytealign_S (w[ 6], w[ 7], offset); w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); w[42] = amd_bytealign_S ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = amd_bytealign_S (w[19], w[20], offset); w[62] = amd_bytealign_S (w[18], w[19], offset); w[61] = amd_bytealign_S (w[17], w[18], offset); w[60] = amd_bytealign_S (w[16], w[17], offset); w[59] = amd_bytealign_S (w[15], w[16], offset); w[58] = amd_bytealign_S (w[14], w[15], offset); w[57] = amd_bytealign_S (w[13], w[14], offset); w[56] = amd_bytealign_S (w[12], w[13], offset); w[55] = amd_bytealign_S (w[11], w[12], offset); w[54] = amd_bytealign_S (w[10], w[11], offset); w[53] = amd_bytealign_S (w[ 9], w[10], offset); w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); w[43] = amd_bytealign_S ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; 
w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = amd_bytealign_S (w[18], w[19], offset); w[62] = amd_bytealign_S (w[17], w[18], offset); w[61] = amd_bytealign_S (w[16], w[17], offset); w[60] = amd_bytealign_S (w[15], w[16], offset); w[59] = amd_bytealign_S (w[14], w[15], offset); w[58] = amd_bytealign_S (w[13], w[14], offset); w[57] = amd_bytealign_S (w[12], w[13], offset); w[56] = amd_bytealign_S (w[11], w[12], offset); w[55] = amd_bytealign_S (w[10], w[11], offset); w[54] = amd_bytealign_S (w[ 9], w[10], offset); w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); w[49] = amd_bytealign_S (w[ 4], w[ 5], offset); w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); w[44] = amd_bytealign_S ( 0, w[ 0], offset); w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = amd_bytealign_S (w[17], w[18], offset); w[62] = amd_bytealign_S (w[16], w[17], offset); w[61] = amd_bytealign_S (w[15], w[16], offset); w[60] = amd_bytealign_S (w[14], w[15], offset); 
w[59] = amd_bytealign_S (w[13], w[14], offset); w[58] = amd_bytealign_S (w[12], w[13], offset); w[57] = amd_bytealign_S (w[11], w[12], offset); w[56] = amd_bytealign_S (w[10], w[11], offset); w[55] = amd_bytealign_S (w[ 9], w[10], offset); w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); w[45] = amd_bytealign_S ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = amd_bytealign_S (w[16], w[17], offset); w[62] = amd_bytealign_S (w[15], w[16], offset); w[61] = amd_bytealign_S (w[14], w[15], offset); w[60] = amd_bytealign_S (w[13], w[14], offset); w[59] = amd_bytealign_S (w[12], w[13], offset); w[58] = amd_bytealign_S (w[11], w[12], offset); w[57] = amd_bytealign_S (w[10], w[11], offset); w[56] = amd_bytealign_S (w[ 9], w[10], offset); w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); 
w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); w[46] = amd_bytealign_S ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 47: w[63] = amd_bytealign_S (w[15], w[16], offset); w[62] = amd_bytealign_S (w[14], w[15], offset); w[61] = amd_bytealign_S (w[13], w[14], offset); w[60] = amd_bytealign_S (w[12], w[13], offset); w[59] = amd_bytealign_S (w[11], w[12], offset); w[58] = amd_bytealign_S (w[10], w[11], offset); w[57] = amd_bytealign_S (w[ 9], w[10], offset); w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); w[47] = amd_bytealign_S ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = amd_bytealign_S (w[14], w[15], 
offset); w[62] = amd_bytealign_S (w[13], w[14], offset); w[61] = amd_bytealign_S (w[12], w[13], offset); w[60] = amd_bytealign_S (w[11], w[12], offset); w[59] = amd_bytealign_S (w[10], w[11], offset); w[58] = amd_bytealign_S (w[ 9], w[10], offset); w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); w[48] = amd_bytealign_S ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = amd_bytealign_S (w[13], w[14], offset); w[62] = amd_bytealign_S (w[12], w[13], offset); w[61] = amd_bytealign_S (w[11], w[12], offset); w[60] = amd_bytealign_S (w[10], w[11], offset); w[59] = amd_bytealign_S (w[ 9], w[10], offset); w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); w[49] = amd_bytealign_S ( 0, w[ 0], offset); w[48] = 0; 
w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = amd_bytealign_S (w[12], w[13], offset); w[62] = amd_bytealign_S (w[11], w[12], offset); w[61] = amd_bytealign_S (w[10], w[11], offset); w[60] = amd_bytealign_S (w[ 9], w[10], offset); w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); w[50] = amd_bytealign_S ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = amd_bytealign_S (w[11], w[12], offset); w[62] = amd_bytealign_S (w[10], w[11], offset); w[61] = amd_bytealign_S (w[ 9], w[10], offset); w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); w[59] = amd_bytealign_S (w[ 
7], w[ 8], offset); w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); w[51] = amd_bytealign_S ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = amd_bytealign_S (w[10], w[11], offset); w[62] = amd_bytealign_S (w[ 9], w[10], offset); w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); w[58] = amd_bytealign_S (w[ 5], w[ 6], offset); w[57] = amd_bytealign_S (w[ 4], w[ 5], offset); w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); w[52] = amd_bytealign_S ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 
0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = amd_bytealign_S (w[ 9], w[10], offset); w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); w[54] = amd_bytealign_S (w[ 0], w[ 1], offset); w[53] = amd_bytealign_S ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = amd_bytealign_S (w[ 8], w[ 9], offset); w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); w[54] = amd_bytealign_S ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] 
= 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); w[55] = amd_bytealign_S ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 56: w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); w[57] = amd_bytealign_S (w[ 0], w[ 1], offset); w[56] = amd_bytealign_S ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 
0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); w[62] = amd_bytealign_S (w[ 4], w[ 5], offset); w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); w[60] = amd_bytealign_S (w[ 2], w[ 3], offset); w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); w[57] = amd_bytealign_S ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); w[60] = amd_bytealign_S (w[ 1], w[ 2], offset); w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); w[58] = amd_bytealign_S ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; 
w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); w[60] = amd_bytealign_S (w[ 0], w[ 1], offset); w[59] = amd_bytealign_S ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = amd_bytealign_S (w[ 2], w[ 3], offset); w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); w[60] = amd_bytealign_S ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; 
w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); w[61] = amd_bytealign_S ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); w[62] = amd_bytealign_S ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 63: w[63] = amd_bytealign_S ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; 
w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #pragma unroll for (int i = 0; i < 64; i++) w[i] = swap32_S (w[i]); #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> (offset_minus_4 * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> (offset_minus_4 * 8); #endif switch (offset_switch) { case 0: w[63] = __byte_perm_S (w[62], w[63], selector); w[62] = __byte_perm_S (w[61], w[62], selector); w[61] = __byte_perm_S (w[60], w[61], selector); w[60] = __byte_perm_S (w[59], w[60], selector); w[59] = __byte_perm_S (w[58], w[59], selector); w[58] = __byte_perm_S (w[57], w[58], selector); w[57] = __byte_perm_S (w[56], w[57], selector); w[56] = __byte_perm_S (w[55], w[56], selector); w[55] = __byte_perm_S (w[54], w[55], selector); w[54] = __byte_perm_S (w[53], w[54], selector); w[53] = __byte_perm_S (w[52], w[53], selector); w[52] = __byte_perm_S (w[51], w[52], selector); w[51] = __byte_perm_S (w[50], w[51], selector); w[50] = __byte_perm_S (w[49], w[50], selector); w[49] = __byte_perm_S (w[48], w[49], selector); w[48] = __byte_perm_S (w[47], w[48], selector); w[47] = __byte_perm_S (w[46], w[47], selector); w[46] = __byte_perm_S (w[45], w[46], selector); w[45] = __byte_perm_S (w[44], w[45], selector); w[44] = __byte_perm_S (w[43], w[44], selector); w[43] = __byte_perm_S (w[42], w[43], selector); w[42] = __byte_perm_S (w[41], w[42], selector); w[41] = __byte_perm_S (w[40], w[41], selector); w[40] = __byte_perm_S (w[39], w[40], selector); w[39] = __byte_perm_S (w[38], 
w[39], selector); w[38] = __byte_perm_S (w[37], w[38], selector); w[37] = __byte_perm_S (w[36], w[37], selector); w[36] = __byte_perm_S (w[35], w[36], selector); w[35] = __byte_perm_S (w[34], w[35], selector); w[34] = __byte_perm_S (w[33], w[34], selector); w[33] = __byte_perm_S (w[32], w[33], selector); w[32] = __byte_perm_S (w[31], w[32], selector); w[31] = __byte_perm_S (w[30], w[31], selector); w[30] = __byte_perm_S (w[29], w[30], selector); w[29] = __byte_perm_S (w[28], w[29], selector); w[28] = __byte_perm_S (w[27], w[28], selector); w[27] = __byte_perm_S (w[26], w[27], selector); w[26] = __byte_perm_S (w[25], w[26], selector); w[25] = __byte_perm_S (w[24], w[25], selector); w[24] = __byte_perm_S (w[23], w[24], selector); w[23] = __byte_perm_S (w[22], w[23], selector); w[22] = __byte_perm_S (w[21], w[22], selector); w[21] = __byte_perm_S (w[20], w[21], selector); w[20] = __byte_perm_S (w[19], w[20], selector); w[19] = __byte_perm_S (w[18], w[19], selector); w[18] = __byte_perm_S (w[17], w[18], selector); w[17] = __byte_perm_S (w[16], w[17], selector); w[16] = __byte_perm_S (w[15], w[16], selector); w[15] = __byte_perm_S (w[14], w[15], selector); w[14] = __byte_perm_S (w[13], w[14], selector); w[13] = __byte_perm_S (w[12], w[13], selector); w[12] = __byte_perm_S (w[11], w[12], selector); w[11] = __byte_perm_S (w[10], w[11], selector); w[10] = __byte_perm_S (w[ 9], w[10], selector); w[ 9] = __byte_perm_S (w[ 8], w[ 9], selector); w[ 8] = __byte_perm_S (w[ 7], w[ 8], selector); w[ 7] = __byte_perm_S (w[ 6], w[ 7], selector); w[ 6] = __byte_perm_S (w[ 5], w[ 6], selector); w[ 5] = __byte_perm_S (w[ 4], w[ 5], selector); w[ 4] = __byte_perm_S (w[ 3], w[ 4], selector); w[ 3] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 2] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 1] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 0] = __byte_perm_S ( 0, w[ 0], selector); break; case 1: w[63] = __byte_perm_S (w[61], w[62], selector); w[62] = __byte_perm_S (w[60], w[61], selector); 
w[61] = __byte_perm_S (w[59], w[60], selector); w[60] = __byte_perm_S (w[58], w[59], selector); w[59] = __byte_perm_S (w[57], w[58], selector); w[58] = __byte_perm_S (w[56], w[57], selector); w[57] = __byte_perm_S (w[55], w[56], selector); w[56] = __byte_perm_S (w[54], w[55], selector); w[55] = __byte_perm_S (w[53], w[54], selector); w[54] = __byte_perm_S (w[52], w[53], selector); w[53] = __byte_perm_S (w[51], w[52], selector); w[52] = __byte_perm_S (w[50], w[51], selector); w[51] = __byte_perm_S (w[49], w[50], selector); w[50] = __byte_perm_S (w[48], w[49], selector); w[49] = __byte_perm_S (w[47], w[48], selector); w[48] = __byte_perm_S (w[46], w[47], selector); w[47] = __byte_perm_S (w[45], w[46], selector); w[46] = __byte_perm_S (w[44], w[45], selector); w[45] = __byte_perm_S (w[43], w[44], selector); w[44] = __byte_perm_S (w[42], w[43], selector); w[43] = __byte_perm_S (w[41], w[42], selector); w[42] = __byte_perm_S (w[40], w[41], selector); w[41] = __byte_perm_S (w[39], w[40], selector); w[40] = __byte_perm_S (w[38], w[39], selector); w[39] = __byte_perm_S (w[37], w[38], selector); w[38] = __byte_perm_S (w[36], w[37], selector); w[37] = __byte_perm_S (w[35], w[36], selector); w[36] = __byte_perm_S (w[34], w[35], selector); w[35] = __byte_perm_S (w[33], w[34], selector); w[34] = __byte_perm_S (w[32], w[33], selector); w[33] = __byte_perm_S (w[31], w[32], selector); w[32] = __byte_perm_S (w[30], w[31], selector); w[31] = __byte_perm_S (w[29], w[30], selector); w[30] = __byte_perm_S (w[28], w[29], selector); w[29] = __byte_perm_S (w[27], w[28], selector); w[28] = __byte_perm_S (w[26], w[27], selector); w[27] = __byte_perm_S (w[25], w[26], selector); w[26] = __byte_perm_S (w[24], w[25], selector); w[25] = __byte_perm_S (w[23], w[24], selector); w[24] = __byte_perm_S (w[22], w[23], selector); w[23] = __byte_perm_S (w[21], w[22], selector); w[22] = __byte_perm_S (w[20], w[21], selector); w[21] = __byte_perm_S (w[19], w[20], selector); w[20] = __byte_perm_S (w[18], 
w[19], selector); w[19] = __byte_perm_S (w[17], w[18], selector); w[18] = __byte_perm_S (w[16], w[17], selector); w[17] = __byte_perm_S (w[15], w[16], selector); w[16] = __byte_perm_S (w[14], w[15], selector); w[15] = __byte_perm_S (w[13], w[14], selector); w[14] = __byte_perm_S (w[12], w[13], selector); w[13] = __byte_perm_S (w[11], w[12], selector); w[12] = __byte_perm_S (w[10], w[11], selector); w[11] = __byte_perm_S (w[ 9], w[10], selector); w[10] = __byte_perm_S (w[ 8], w[ 9], selector); w[ 9] = __byte_perm_S (w[ 7], w[ 8], selector); w[ 8] = __byte_perm_S (w[ 6], w[ 7], selector); w[ 7] = __byte_perm_S (w[ 5], w[ 6], selector); w[ 6] = __byte_perm_S (w[ 4], w[ 5], selector); w[ 5] = __byte_perm_S (w[ 3], w[ 4], selector); w[ 4] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 3] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 2] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 1] = __byte_perm_S ( 0, w[ 0], selector); w[ 0] = 0; break; case 2: w[63] = __byte_perm_S (w[60], w[61], selector); w[62] = __byte_perm_S (w[59], w[60], selector); w[61] = __byte_perm_S (w[58], w[59], selector); w[60] = __byte_perm_S (w[57], w[58], selector); w[59] = __byte_perm_S (w[56], w[57], selector); w[58] = __byte_perm_S (w[55], w[56], selector); w[57] = __byte_perm_S (w[54], w[55], selector); w[56] = __byte_perm_S (w[53], w[54], selector); w[55] = __byte_perm_S (w[52], w[53], selector); w[54] = __byte_perm_S (w[51], w[52], selector); w[53] = __byte_perm_S (w[50], w[51], selector); w[52] = __byte_perm_S (w[49], w[50], selector); w[51] = __byte_perm_S (w[48], w[49], selector); w[50] = __byte_perm_S (w[47], w[48], selector); w[49] = __byte_perm_S (w[46], w[47], selector); w[48] = __byte_perm_S (w[45], w[46], selector); w[47] = __byte_perm_S (w[44], w[45], selector); w[46] = __byte_perm_S (w[43], w[44], selector); w[45] = __byte_perm_S (w[42], w[43], selector); w[44] = __byte_perm_S (w[41], w[42], selector); w[43] = __byte_perm_S (w[40], w[41], selector); w[42] = __byte_perm_S (w[39], w[40], 
selector); w[41] = __byte_perm_S (w[38], w[39], selector); w[40] = __byte_perm_S (w[37], w[38], selector); w[39] = __byte_perm_S (w[36], w[37], selector); w[38] = __byte_perm_S (w[35], w[36], selector); w[37] = __byte_perm_S (w[34], w[35], selector); w[36] = __byte_perm_S (w[33], w[34], selector); w[35] = __byte_perm_S (w[32], w[33], selector); w[34] = __byte_perm_S (w[31], w[32], selector); w[33] = __byte_perm_S (w[30], w[31], selector); w[32] = __byte_perm_S (w[29], w[30], selector); w[31] = __byte_perm_S (w[28], w[29], selector); w[30] = __byte_perm_S (w[27], w[28], selector); w[29] = __byte_perm_S (w[26], w[27], selector); w[28] = __byte_perm_S (w[25], w[26], selector); w[27] = __byte_perm_S (w[24], w[25], selector); w[26] = __byte_perm_S (w[23], w[24], selector); w[25] = __byte_perm_S (w[22], w[23], selector); w[24] = __byte_perm_S (w[21], w[22], selector); w[23] = __byte_perm_S (w[20], w[21], selector); w[22] = __byte_perm_S (w[19], w[20], selector); w[21] = __byte_perm_S (w[18], w[19], selector); w[20] = __byte_perm_S (w[17], w[18], selector); w[19] = __byte_perm_S (w[16], w[17], selector); w[18] = __byte_perm_S (w[15], w[16], selector); w[17] = __byte_perm_S (w[14], w[15], selector); w[16] = __byte_perm_S (w[13], w[14], selector); w[15] = __byte_perm_S (w[12], w[13], selector); w[14] = __byte_perm_S (w[11], w[12], selector); w[13] = __byte_perm_S (w[10], w[11], selector); w[12] = __byte_perm_S (w[ 9], w[10], selector); w[11] = __byte_perm_S (w[ 8], w[ 9], selector); w[10] = __byte_perm_S (w[ 7], w[ 8], selector); w[ 9] = __byte_perm_S (w[ 6], w[ 7], selector); w[ 8] = __byte_perm_S (w[ 5], w[ 6], selector); w[ 7] = __byte_perm_S (w[ 4], w[ 5], selector); w[ 6] = __byte_perm_S (w[ 3], w[ 4], selector); w[ 5] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 4] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 3] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 2] = __byte_perm_S ( 0, w[ 0], selector); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = __byte_perm_S (w[59], 
w[60], selector); w[62] = __byte_perm_S (w[58], w[59], selector); w[61] = __byte_perm_S (w[57], w[58], selector); w[60] = __byte_perm_S (w[56], w[57], selector); w[59] = __byte_perm_S (w[55], w[56], selector); w[58] = __byte_perm_S (w[54], w[55], selector); w[57] = __byte_perm_S (w[53], w[54], selector); w[56] = __byte_perm_S (w[52], w[53], selector); w[55] = __byte_perm_S (w[51], w[52], selector); w[54] = __byte_perm_S (w[50], w[51], selector); w[53] = __byte_perm_S (w[49], w[50], selector); w[52] = __byte_perm_S (w[48], w[49], selector); w[51] = __byte_perm_S (w[47], w[48], selector); w[50] = __byte_perm_S (w[46], w[47], selector); w[49] = __byte_perm_S (w[45], w[46], selector); w[48] = __byte_perm_S (w[44], w[45], selector); w[47] = __byte_perm_S (w[43], w[44], selector); w[46] = __byte_perm_S (w[42], w[43], selector); w[45] = __byte_perm_S (w[41], w[42], selector); w[44] = __byte_perm_S (w[40], w[41], selector); w[43] = __byte_perm_S (w[39], w[40], selector); w[42] = __byte_perm_S (w[38], w[39], selector); w[41] = __byte_perm_S (w[37], w[38], selector); w[40] = __byte_perm_S (w[36], w[37], selector); w[39] = __byte_perm_S (w[35], w[36], selector); w[38] = __byte_perm_S (w[34], w[35], selector); w[37] = __byte_perm_S (w[33], w[34], selector); w[36] = __byte_perm_S (w[32], w[33], selector); w[35] = __byte_perm_S (w[31], w[32], selector); w[34] = __byte_perm_S (w[30], w[31], selector); w[33] = __byte_perm_S (w[29], w[30], selector); w[32] = __byte_perm_S (w[28], w[29], selector); w[31] = __byte_perm_S (w[27], w[28], selector); w[30] = __byte_perm_S (w[26], w[27], selector); w[29] = __byte_perm_S (w[25], w[26], selector); w[28] = __byte_perm_S (w[24], w[25], selector); w[27] = __byte_perm_S (w[23], w[24], selector); w[26] = __byte_perm_S (w[22], w[23], selector); w[25] = __byte_perm_S (w[21], w[22], selector); w[24] = __byte_perm_S (w[20], w[21], selector); w[23] = __byte_perm_S (w[19], w[20], selector); w[22] = __byte_perm_S (w[18], w[19], selector); w[21] = 
__byte_perm_S (w[17], w[18], selector); w[20] = __byte_perm_S (w[16], w[17], selector); w[19] = __byte_perm_S (w[15], w[16], selector); w[18] = __byte_perm_S (w[14], w[15], selector); w[17] = __byte_perm_S (w[13], w[14], selector); w[16] = __byte_perm_S (w[12], w[13], selector); w[15] = __byte_perm_S (w[11], w[12], selector); w[14] = __byte_perm_S (w[10], w[11], selector); w[13] = __byte_perm_S (w[ 9], w[10], selector); w[12] = __byte_perm_S (w[ 8], w[ 9], selector); w[11] = __byte_perm_S (w[ 7], w[ 8], selector); w[10] = __byte_perm_S (w[ 6], w[ 7], selector); w[ 9] = __byte_perm_S (w[ 5], w[ 6], selector); w[ 8] = __byte_perm_S (w[ 4], w[ 5], selector); w[ 7] = __byte_perm_S (w[ 3], w[ 4], selector); w[ 6] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 5] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 4] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 3] = __byte_perm_S ( 0, w[ 0], selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = __byte_perm_S (w[58], w[59], selector); w[62] = __byte_perm_S (w[57], w[58], selector); w[61] = __byte_perm_S (w[56], w[57], selector); w[60] = __byte_perm_S (w[55], w[56], selector); w[59] = __byte_perm_S (w[54], w[55], selector); w[58] = __byte_perm_S (w[53], w[54], selector); w[57] = __byte_perm_S (w[52], w[53], selector); w[56] = __byte_perm_S (w[51], w[52], selector); w[55] = __byte_perm_S (w[50], w[51], selector); w[54] = __byte_perm_S (w[49], w[50], selector); w[53] = __byte_perm_S (w[48], w[49], selector); w[52] = __byte_perm_S (w[47], w[48], selector); w[51] = __byte_perm_S (w[46], w[47], selector); w[50] = __byte_perm_S (w[45], w[46], selector); w[49] = __byte_perm_S (w[44], w[45], selector); w[48] = __byte_perm_S (w[43], w[44], selector); w[47] = __byte_perm_S (w[42], w[43], selector); w[46] = __byte_perm_S (w[41], w[42], selector); w[45] = __byte_perm_S (w[40], w[41], selector); w[44] = __byte_perm_S (w[39], w[40], selector); w[43] = __byte_perm_S (w[38], w[39], selector); w[42] = __byte_perm_S (w[37], w[38], 
selector); w[41] = __byte_perm_S (w[36], w[37], selector); w[40] = __byte_perm_S (w[35], w[36], selector); w[39] = __byte_perm_S (w[34], w[35], selector); w[38] = __byte_perm_S (w[33], w[34], selector); w[37] = __byte_perm_S (w[32], w[33], selector); w[36] = __byte_perm_S (w[31], w[32], selector); w[35] = __byte_perm_S (w[30], w[31], selector); w[34] = __byte_perm_S (w[29], w[30], selector); w[33] = __byte_perm_S (w[28], w[29], selector); w[32] = __byte_perm_S (w[27], w[28], selector); w[31] = __byte_perm_S (w[26], w[27], selector); w[30] = __byte_perm_S (w[25], w[26], selector); w[29] = __byte_perm_S (w[24], w[25], selector); w[28] = __byte_perm_S (w[23], w[24], selector); w[27] = __byte_perm_S (w[22], w[23], selector); w[26] = __byte_perm_S (w[21], w[22], selector); w[25] = __byte_perm_S (w[20], w[21], selector); w[24] = __byte_perm_S (w[19], w[20], selector); w[23] = __byte_perm_S (w[18], w[19], selector); w[22] = __byte_perm_S (w[17], w[18], selector); w[21] = __byte_perm_S (w[16], w[17], selector); w[20] = __byte_perm_S (w[15], w[16], selector); w[19] = __byte_perm_S (w[14], w[15], selector); w[18] = __byte_perm_S (w[13], w[14], selector); w[17] = __byte_perm_S (w[12], w[13], selector); w[16] = __byte_perm_S (w[11], w[12], selector); w[15] = __byte_perm_S (w[10], w[11], selector); w[14] = __byte_perm_S (w[ 9], w[10], selector); w[13] = __byte_perm_S (w[ 8], w[ 9], selector); w[12] = __byte_perm_S (w[ 7], w[ 8], selector); w[11] = __byte_perm_S (w[ 6], w[ 7], selector); w[10] = __byte_perm_S (w[ 5], w[ 6], selector); w[ 9] = __byte_perm_S (w[ 4], w[ 5], selector); w[ 8] = __byte_perm_S (w[ 3], w[ 4], selector); w[ 7] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 6] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 5] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 4] = __byte_perm_S ( 0, w[ 0], selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = __byte_perm_S (w[57], w[58], selector); w[62] = __byte_perm_S (w[56], w[57], selector); w[61] = 
__byte_perm_S (w[55], w[56], selector); w[60] = __byte_perm_S (w[54], w[55], selector); w[59] = __byte_perm_S (w[53], w[54], selector); w[58] = __byte_perm_S (w[52], w[53], selector); w[57] = __byte_perm_S (w[51], w[52], selector); w[56] = __byte_perm_S (w[50], w[51], selector); w[55] = __byte_perm_S (w[49], w[50], selector); w[54] = __byte_perm_S (w[48], w[49], selector); w[53] = __byte_perm_S (w[47], w[48], selector); w[52] = __byte_perm_S (w[46], w[47], selector); w[51] = __byte_perm_S (w[45], w[46], selector); w[50] = __byte_perm_S (w[44], w[45], selector); w[49] = __byte_perm_S (w[43], w[44], selector); w[48] = __byte_perm_S (w[42], w[43], selector); w[47] = __byte_perm_S (w[41], w[42], selector); w[46] = __byte_perm_S (w[40], w[41], selector); w[45] = __byte_perm_S (w[39], w[40], selector); w[44] = __byte_perm_S (w[38], w[39], selector); w[43] = __byte_perm_S (w[37], w[38], selector); w[42] = __byte_perm_S (w[36], w[37], selector); w[41] = __byte_perm_S (w[35], w[36], selector); w[40] = __byte_perm_S (w[34], w[35], selector); w[39] = __byte_perm_S (w[33], w[34], selector); w[38] = __byte_perm_S (w[32], w[33], selector); w[37] = __byte_perm_S (w[31], w[32], selector); w[36] = __byte_perm_S (w[30], w[31], selector); w[35] = __byte_perm_S (w[29], w[30], selector); w[34] = __byte_perm_S (w[28], w[29], selector); w[33] = __byte_perm_S (w[27], w[28], selector); w[32] = __byte_perm_S (w[26], w[27], selector); w[31] = __byte_perm_S (w[25], w[26], selector); w[30] = __byte_perm_S (w[24], w[25], selector); w[29] = __byte_perm_S (w[23], w[24], selector); w[28] = __byte_perm_S (w[22], w[23], selector); w[27] = __byte_perm_S (w[21], w[22], selector); w[26] = __byte_perm_S (w[20], w[21], selector); w[25] = __byte_perm_S (w[19], w[20], selector); w[24] = __byte_perm_S (w[18], w[19], selector); w[23] = __byte_perm_S (w[17], w[18], selector); w[22] = __byte_perm_S (w[16], w[17], selector); w[21] = __byte_perm_S (w[15], w[16], selector); w[20] = __byte_perm_S (w[14], w[15], 
selector); w[19] = __byte_perm_S (w[13], w[14], selector); w[18] = __byte_perm_S (w[12], w[13], selector); w[17] = __byte_perm_S (w[11], w[12], selector); w[16] = __byte_perm_S (w[10], w[11], selector); w[15] = __byte_perm_S (w[ 9], w[10], selector); w[14] = __byte_perm_S (w[ 8], w[ 9], selector); w[13] = __byte_perm_S (w[ 7], w[ 8], selector); w[12] = __byte_perm_S (w[ 6], w[ 7], selector); w[11] = __byte_perm_S (w[ 5], w[ 6], selector); w[10] = __byte_perm_S (w[ 4], w[ 5], selector); w[ 9] = __byte_perm_S (w[ 3], w[ 4], selector); w[ 8] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 7] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 6] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 5] = __byte_perm_S ( 0, w[ 0], selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = __byte_perm_S (w[56], w[57], selector); w[62] = __byte_perm_S (w[55], w[56], selector); w[61] = __byte_perm_S (w[54], w[55], selector); w[60] = __byte_perm_S (w[53], w[54], selector); w[59] = __byte_perm_S (w[52], w[53], selector); w[58] = __byte_perm_S (w[51], w[52], selector); w[57] = __byte_perm_S (w[50], w[51], selector); w[56] = __byte_perm_S (w[49], w[50], selector); w[55] = __byte_perm_S (w[48], w[49], selector); w[54] = __byte_perm_S (w[47], w[48], selector); w[53] = __byte_perm_S (w[46], w[47], selector); w[52] = __byte_perm_S (w[45], w[46], selector); w[51] = __byte_perm_S (w[44], w[45], selector); w[50] = __byte_perm_S (w[43], w[44], selector); w[49] = __byte_perm_S (w[42], w[43], selector); w[48] = __byte_perm_S (w[41], w[42], selector); w[47] = __byte_perm_S (w[40], w[41], selector); w[46] = __byte_perm_S (w[39], w[40], selector); w[45] = __byte_perm_S (w[38], w[39], selector); w[44] = __byte_perm_S (w[37], w[38], selector); w[43] = __byte_perm_S (w[36], w[37], selector); w[42] = __byte_perm_S (w[35], w[36], selector); w[41] = __byte_perm_S (w[34], w[35], selector); w[40] = __byte_perm_S (w[33], w[34], selector); w[39] = __byte_perm_S (w[32], w[33], selector); 
w[38] = __byte_perm_S (w[31], w[32], selector); w[37] = __byte_perm_S (w[30], w[31], selector); w[36] = __byte_perm_S (w[29], w[30], selector); w[35] = __byte_perm_S (w[28], w[29], selector); w[34] = __byte_perm_S (w[27], w[28], selector); w[33] = __byte_perm_S (w[26], w[27], selector); w[32] = __byte_perm_S (w[25], w[26], selector); w[31] = __byte_perm_S (w[24], w[25], selector); w[30] = __byte_perm_S (w[23], w[24], selector); w[29] = __byte_perm_S (w[22], w[23], selector); w[28] = __byte_perm_S (w[21], w[22], selector); w[27] = __byte_perm_S (w[20], w[21], selector); w[26] = __byte_perm_S (w[19], w[20], selector); w[25] = __byte_perm_S (w[18], w[19], selector); w[24] = __byte_perm_S (w[17], w[18], selector); w[23] = __byte_perm_S (w[16], w[17], selector); w[22] = __byte_perm_S (w[15], w[16], selector); w[21] = __byte_perm_S (w[14], w[15], selector); w[20] = __byte_perm_S (w[13], w[14], selector); w[19] = __byte_perm_S (w[12], w[13], selector); w[18] = __byte_perm_S (w[11], w[12], selector); w[17] = __byte_perm_S (w[10], w[11], selector); w[16] = __byte_perm_S (w[ 9], w[10], selector); w[15] = __byte_perm_S (w[ 8], w[ 9], selector); w[14] = __byte_perm_S (w[ 7], w[ 8], selector); w[13] = __byte_perm_S (w[ 6], w[ 7], selector); w[12] = __byte_perm_S (w[ 5], w[ 6], selector); w[11] = __byte_perm_S (w[ 4], w[ 5], selector); w[10] = __byte_perm_S (w[ 3], w[ 4], selector); w[ 9] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 8] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 7] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 6] = __byte_perm_S ( 0, w[ 0], selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = __byte_perm_S (w[55], w[56], selector); w[62] = __byte_perm_S (w[54], w[55], selector); w[61] = __byte_perm_S (w[53], w[54], selector); w[60] = __byte_perm_S (w[52], w[53], selector); w[59] = __byte_perm_S (w[51], w[52], selector); w[58] = __byte_perm_S (w[50], w[51], selector); w[57] = __byte_perm_S (w[49], w[50], selector); 
w[56] = __byte_perm_S (w[48], w[49], selector); w[55] = __byte_perm_S (w[47], w[48], selector); w[54] = __byte_perm_S (w[46], w[47], selector); w[53] = __byte_perm_S (w[45], w[46], selector); w[52] = __byte_perm_S (w[44], w[45], selector); w[51] = __byte_perm_S (w[43], w[44], selector); w[50] = __byte_perm_S (w[42], w[43], selector); w[49] = __byte_perm_S (w[41], w[42], selector); w[48] = __byte_perm_S (w[40], w[41], selector); w[47] = __byte_perm_S (w[39], w[40], selector); w[46] = __byte_perm_S (w[38], w[39], selector); w[45] = __byte_perm_S (w[37], w[38], selector); w[44] = __byte_perm_S (w[36], w[37], selector); w[43] = __byte_perm_S (w[35], w[36], selector); w[42] = __byte_perm_S (w[34], w[35], selector); w[41] = __byte_perm_S (w[33], w[34], selector); w[40] = __byte_perm_S (w[32], w[33], selector); w[39] = __byte_perm_S (w[31], w[32], selector); w[38] = __byte_perm_S (w[30], w[31], selector); w[37] = __byte_perm_S (w[29], w[30], selector); w[36] = __byte_perm_S (w[28], w[29], selector); w[35] = __byte_perm_S (w[27], w[28], selector); w[34] = __byte_perm_S (w[26], w[27], selector); w[33] = __byte_perm_S (w[25], w[26], selector); w[32] = __byte_perm_S (w[24], w[25], selector); w[31] = __byte_perm_S (w[23], w[24], selector); w[30] = __byte_perm_S (w[22], w[23], selector); w[29] = __byte_perm_S (w[21], w[22], selector); w[28] = __byte_perm_S (w[20], w[21], selector); w[27] = __byte_perm_S (w[19], w[20], selector); w[26] = __byte_perm_S (w[18], w[19], selector); w[25] = __byte_perm_S (w[17], w[18], selector); w[24] = __byte_perm_S (w[16], w[17], selector); w[23] = __byte_perm_S (w[15], w[16], selector); w[22] = __byte_perm_S (w[14], w[15], selector); w[21] = __byte_perm_S (w[13], w[14], selector); w[20] = __byte_perm_S (w[12], w[13], selector); w[19] = __byte_perm_S (w[11], w[12], selector); w[18] = __byte_perm_S (w[10], w[11], selector); w[17] = __byte_perm_S (w[ 9], w[10], selector); w[16] = __byte_perm_S (w[ 8], w[ 9], selector); w[15] = __byte_perm_S (w[ 7], 
w[ 8], selector); w[14] = __byte_perm_S (w[ 6], w[ 7], selector); w[13] = __byte_perm_S (w[ 5], w[ 6], selector); w[12] = __byte_perm_S (w[ 4], w[ 5], selector); w[11] = __byte_perm_S (w[ 3], w[ 4], selector); w[10] = __byte_perm_S (w[ 2], w[ 3], selector); w[ 9] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 8] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 7] = __byte_perm_S ( 0, w[ 0], selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = __byte_perm_S (w[54], w[55], selector); w[62] = __byte_perm_S (w[53], w[54], selector); w[61] = __byte_perm_S (w[52], w[53], selector); w[60] = __byte_perm_S (w[51], w[52], selector); w[59] = __byte_perm_S (w[50], w[51], selector); w[58] = __byte_perm_S (w[49], w[50], selector); w[57] = __byte_perm_S (w[48], w[49], selector); w[56] = __byte_perm_S (w[47], w[48], selector); w[55] = __byte_perm_S (w[46], w[47], selector); w[54] = __byte_perm_S (w[45], w[46], selector); w[53] = __byte_perm_S (w[44], w[45], selector); w[52] = __byte_perm_S (w[43], w[44], selector); w[51] = __byte_perm_S (w[42], w[43], selector); w[50] = __byte_perm_S (w[41], w[42], selector); w[49] = __byte_perm_S (w[40], w[41], selector); w[48] = __byte_perm_S (w[39], w[40], selector); w[47] = __byte_perm_S (w[38], w[39], selector); w[46] = __byte_perm_S (w[37], w[38], selector); w[45] = __byte_perm_S (w[36], w[37], selector); w[44] = __byte_perm_S (w[35], w[36], selector); w[43] = __byte_perm_S (w[34], w[35], selector); w[42] = __byte_perm_S (w[33], w[34], selector); w[41] = __byte_perm_S (w[32], w[33], selector); w[40] = __byte_perm_S (w[31], w[32], selector); w[39] = __byte_perm_S (w[30], w[31], selector); w[38] = __byte_perm_S (w[29], w[30], selector); w[37] = __byte_perm_S (w[28], w[29], selector); w[36] = __byte_perm_S (w[27], w[28], selector); w[35] = __byte_perm_S (w[26], w[27], selector); w[34] = __byte_perm_S (w[25], w[26], selector); w[33] = __byte_perm_S (w[24], w[25], selector); w[32] = 
__byte_perm_S (w[23], w[24], selector); w[31] = __byte_perm_S (w[22], w[23], selector); w[30] = __byte_perm_S (w[21], w[22], selector); w[29] = __byte_perm_S (w[20], w[21], selector); w[28] = __byte_perm_S (w[19], w[20], selector); w[27] = __byte_perm_S (w[18], w[19], selector); w[26] = __byte_perm_S (w[17], w[18], selector); w[25] = __byte_perm_S (w[16], w[17], selector); w[24] = __byte_perm_S (w[15], w[16], selector); w[23] = __byte_perm_S (w[14], w[15], selector); w[22] = __byte_perm_S (w[13], w[14], selector); w[21] = __byte_perm_S (w[12], w[13], selector); w[20] = __byte_perm_S (w[11], w[12], selector); w[19] = __byte_perm_S (w[10], w[11], selector); w[18] = __byte_perm_S (w[ 9], w[10], selector); w[17] = __byte_perm_S (w[ 8], w[ 9], selector); w[16] = __byte_perm_S (w[ 7], w[ 8], selector); w[15] = __byte_perm_S (w[ 6], w[ 7], selector); w[14] = __byte_perm_S (w[ 5], w[ 6], selector); w[13] = __byte_perm_S (w[ 4], w[ 5], selector); w[12] = __byte_perm_S (w[ 3], w[ 4], selector); w[11] = __byte_perm_S (w[ 2], w[ 3], selector); w[10] = __byte_perm_S (w[ 1], w[ 2], selector); w[ 9] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 8] = __byte_perm_S ( 0, w[ 0], selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = __byte_perm_S (w[53], w[54], selector); w[62] = __byte_perm_S (w[52], w[53], selector); w[61] = __byte_perm_S (w[51], w[52], selector); w[60] = __byte_perm_S (w[50], w[51], selector); w[59] = __byte_perm_S (w[49], w[50], selector); w[58] = __byte_perm_S (w[48], w[49], selector); w[57] = __byte_perm_S (w[47], w[48], selector); w[56] = __byte_perm_S (w[46], w[47], selector); w[55] = __byte_perm_S (w[45], w[46], selector); w[54] = __byte_perm_S (w[44], w[45], selector); w[53] = __byte_perm_S (w[43], w[44], selector); w[52] = __byte_perm_S (w[42], w[43], selector); w[51] = __byte_perm_S (w[41], w[42], selector); w[50] = __byte_perm_S (w[40], w[41], selector); w[49] = __byte_perm_S (w[39], 
w[40], selector); w[48] = __byte_perm_S (w[38], w[39], selector); w[47] = __byte_perm_S (w[37], w[38], selector); w[46] = __byte_perm_S (w[36], w[37], selector); w[45] = __byte_perm_S (w[35], w[36], selector); w[44] = __byte_perm_S (w[34], w[35], selector); w[43] = __byte_perm_S (w[33], w[34], selector); w[42] = __byte_perm_S (w[32], w[33], selector); w[41] = __byte_perm_S (w[31], w[32], selector); w[40] = __byte_perm_S (w[30], w[31], selector); w[39] = __byte_perm_S (w[29], w[30], selector); w[38] = __byte_perm_S (w[28], w[29], selector); w[37] = __byte_perm_S (w[27], w[28], selector); w[36] = __byte_perm_S (w[26], w[27], selector); w[35] = __byte_perm_S (w[25], w[26], selector); w[34] = __byte_perm_S (w[24], w[25], selector); w[33] = __byte_perm_S (w[23], w[24], selector); w[32] = __byte_perm_S (w[22], w[23], selector); w[31] = __byte_perm_S (w[21], w[22], selector); w[30] = __byte_perm_S (w[20], w[21], selector); w[29] = __byte_perm_S (w[19], w[20], selector); w[28] = __byte_perm_S (w[18], w[19], selector); w[27] = __byte_perm_S (w[17], w[18], selector); w[26] = __byte_perm_S (w[16], w[17], selector); w[25] = __byte_perm_S (w[15], w[16], selector); w[24] = __byte_perm_S (w[14], w[15], selector); w[23] = __byte_perm_S (w[13], w[14], selector); w[22] = __byte_perm_S (w[12], w[13], selector); w[21] = __byte_perm_S (w[11], w[12], selector); w[20] = __byte_perm_S (w[10], w[11], selector); w[19] = __byte_perm_S (w[ 9], w[10], selector); w[18] = __byte_perm_S (w[ 8], w[ 9], selector); w[17] = __byte_perm_S (w[ 7], w[ 8], selector); w[16] = __byte_perm_S (w[ 6], w[ 7], selector); w[15] = __byte_perm_S (w[ 5], w[ 6], selector); w[14] = __byte_perm_S (w[ 4], w[ 5], selector); w[13] = __byte_perm_S (w[ 3], w[ 4], selector); w[12] = __byte_perm_S (w[ 2], w[ 3], selector); w[11] = __byte_perm_S (w[ 1], w[ 2], selector); w[10] = __byte_perm_S (w[ 0], w[ 1], selector); w[ 9] = __byte_perm_S ( 0, w[ 0], selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 
0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = __byte_perm_S (w[52], w[53], selector); w[62] = __byte_perm_S (w[51], w[52], selector); w[61] = __byte_perm_S (w[50], w[51], selector); w[60] = __byte_perm_S (w[49], w[50], selector); w[59] = __byte_perm_S (w[48], w[49], selector); w[58] = __byte_perm_S (w[47], w[48], selector); w[57] = __byte_perm_S (w[46], w[47], selector); w[56] = __byte_perm_S (w[45], w[46], selector); w[55] = __byte_perm_S (w[44], w[45], selector); w[54] = __byte_perm_S (w[43], w[44], selector); w[53] = __byte_perm_S (w[42], w[43], selector); w[52] = __byte_perm_S (w[41], w[42], selector); w[51] = __byte_perm_S (w[40], w[41], selector); w[50] = __byte_perm_S (w[39], w[40], selector); w[49] = __byte_perm_S (w[38], w[39], selector); w[48] = __byte_perm_S (w[37], w[38], selector); w[47] = __byte_perm_S (w[36], w[37], selector); w[46] = __byte_perm_S (w[35], w[36], selector); w[45] = __byte_perm_S (w[34], w[35], selector); w[44] = __byte_perm_S (w[33], w[34], selector); w[43] = __byte_perm_S (w[32], w[33], selector); w[42] = __byte_perm_S (w[31], w[32], selector); w[41] = __byte_perm_S (w[30], w[31], selector); w[40] = __byte_perm_S (w[29], w[30], selector); w[39] = __byte_perm_S (w[28], w[29], selector); w[38] = __byte_perm_S (w[27], w[28], selector); w[37] = __byte_perm_S (w[26], w[27], selector); w[36] = __byte_perm_S (w[25], w[26], selector); w[35] = __byte_perm_S (w[24], w[25], selector); w[34] = __byte_perm_S (w[23], w[24], selector); w[33] = __byte_perm_S (w[22], w[23], selector); w[32] = __byte_perm_S (w[21], w[22], selector); w[31] = __byte_perm_S (w[20], w[21], selector); w[30] = __byte_perm_S (w[19], w[20], selector); w[29] = __byte_perm_S (w[18], w[19], selector); w[28] = __byte_perm_S (w[17], w[18], selector); w[27] = __byte_perm_S (w[16], w[17], selector); w[26] = __byte_perm_S (w[15], w[16], selector); w[25] = __byte_perm_S (w[14], w[15], selector); w[24] = __byte_perm_S (w[13], w[14], selector); w[23] = __byte_perm_S 
(w[12], w[13], selector); w[22] = __byte_perm_S (w[11], w[12], selector); w[21] = __byte_perm_S (w[10], w[11], selector); w[20] = __byte_perm_S (w[ 9], w[10], selector); w[19] = __byte_perm_S (w[ 8], w[ 9], selector); w[18] = __byte_perm_S (w[ 7], w[ 8], selector); w[17] = __byte_perm_S (w[ 6], w[ 7], selector); w[16] = __byte_perm_S (w[ 5], w[ 6], selector); w[15] = __byte_perm_S (w[ 4], w[ 5], selector); w[14] = __byte_perm_S (w[ 3], w[ 4], selector); w[13] = __byte_perm_S (w[ 2], w[ 3], selector); w[12] = __byte_perm_S (w[ 1], w[ 2], selector); w[11] = __byte_perm_S (w[ 0], w[ 1], selector); w[10] = __byte_perm_S ( 0, w[ 0], selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = __byte_perm_S (w[51], w[52], selector); w[62] = __byte_perm_S (w[50], w[51], selector); w[61] = __byte_perm_S (w[49], w[50], selector); w[60] = __byte_perm_S (w[48], w[49], selector); w[59] = __byte_perm_S (w[47], w[48], selector); w[58] = __byte_perm_S (w[46], w[47], selector); w[57] = __byte_perm_S (w[45], w[46], selector); w[56] = __byte_perm_S (w[44], w[45], selector); w[55] = __byte_perm_S (w[43], w[44], selector); w[54] = __byte_perm_S (w[42], w[43], selector); w[53] = __byte_perm_S (w[41], w[42], selector); w[52] = __byte_perm_S (w[40], w[41], selector); w[51] = __byte_perm_S (w[39], w[40], selector); w[50] = __byte_perm_S (w[38], w[39], selector); w[49] = __byte_perm_S (w[37], w[38], selector); w[48] = __byte_perm_S (w[36], w[37], selector); w[47] = __byte_perm_S (w[35], w[36], selector); w[46] = __byte_perm_S (w[34], w[35], selector); w[45] = __byte_perm_S (w[33], w[34], selector); w[44] = __byte_perm_S (w[32], w[33], selector); w[43] = __byte_perm_S (w[31], w[32], selector); w[42] = __byte_perm_S (w[30], w[31], selector); w[41] = __byte_perm_S (w[29], w[30], selector); w[40] = __byte_perm_S (w[28], w[29], selector); w[39] = __byte_perm_S (w[27], w[28], selector); w[38] = __byte_perm_S 
(w[26], w[27], selector); w[37] = __byte_perm_S (w[25], w[26], selector); w[36] = __byte_perm_S (w[24], w[25], selector); w[35] = __byte_perm_S (w[23], w[24], selector); w[34] = __byte_perm_S (w[22], w[23], selector); w[33] = __byte_perm_S (w[21], w[22], selector); w[32] = __byte_perm_S (w[20], w[21], selector); w[31] = __byte_perm_S (w[19], w[20], selector); w[30] = __byte_perm_S (w[18], w[19], selector); w[29] = __byte_perm_S (w[17], w[18], selector); w[28] = __byte_perm_S (w[16], w[17], selector); w[27] = __byte_perm_S (w[15], w[16], selector); w[26] = __byte_perm_S (w[14], w[15], selector); w[25] = __byte_perm_S (w[13], w[14], selector); w[24] = __byte_perm_S (w[12], w[13], selector); w[23] = __byte_perm_S (w[11], w[12], selector); w[22] = __byte_perm_S (w[10], w[11], selector); w[21] = __byte_perm_S (w[ 9], w[10], selector); w[20] = __byte_perm_S (w[ 8], w[ 9], selector); w[19] = __byte_perm_S (w[ 7], w[ 8], selector); w[18] = __byte_perm_S (w[ 6], w[ 7], selector); w[17] = __byte_perm_S (w[ 5], w[ 6], selector); w[16] = __byte_perm_S (w[ 4], w[ 5], selector); w[15] = __byte_perm_S (w[ 3], w[ 4], selector); w[14] = __byte_perm_S (w[ 2], w[ 3], selector); w[13] = __byte_perm_S (w[ 1], w[ 2], selector); w[12] = __byte_perm_S (w[ 0], w[ 1], selector); w[11] = __byte_perm_S ( 0, w[ 0], selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = __byte_perm_S (w[50], w[51], selector); w[62] = __byte_perm_S (w[49], w[50], selector); w[61] = __byte_perm_S (w[48], w[49], selector); w[60] = __byte_perm_S (w[47], w[48], selector); w[59] = __byte_perm_S (w[46], w[47], selector); w[58] = __byte_perm_S (w[45], w[46], selector); w[57] = __byte_perm_S (w[44], w[45], selector); w[56] = __byte_perm_S (w[43], w[44], selector); w[55] = __byte_perm_S (w[42], w[43], selector); w[54] = __byte_perm_S (w[41], w[42], selector); w[53] = __byte_perm_S (w[40], w[41], selector); w[52] = 
__byte_perm_S (w[39], w[40], selector); w[51] = __byte_perm_S (w[38], w[39], selector); w[50] = __byte_perm_S (w[37], w[38], selector); w[49] = __byte_perm_S (w[36], w[37], selector); w[48] = __byte_perm_S (w[35], w[36], selector); w[47] = __byte_perm_S (w[34], w[35], selector); w[46] = __byte_perm_S (w[33], w[34], selector); w[45] = __byte_perm_S (w[32], w[33], selector); w[44] = __byte_perm_S (w[31], w[32], selector); w[43] = __byte_perm_S (w[30], w[31], selector); w[42] = __byte_perm_S (w[29], w[30], selector); w[41] = __byte_perm_S (w[28], w[29], selector); w[40] = __byte_perm_S (w[27], w[28], selector); w[39] = __byte_perm_S (w[26], w[27], selector); w[38] = __byte_perm_S (w[25], w[26], selector); w[37] = __byte_perm_S (w[24], w[25], selector); w[36] = __byte_perm_S (w[23], w[24], selector); w[35] = __byte_perm_S (w[22], w[23], selector); w[34] = __byte_perm_S (w[21], w[22], selector); w[33] = __byte_perm_S (w[20], w[21], selector); w[32] = __byte_perm_S (w[19], w[20], selector); w[31] = __byte_perm_S (w[18], w[19], selector); w[30] = __byte_perm_S (w[17], w[18], selector); w[29] = __byte_perm_S (w[16], w[17], selector); w[28] = __byte_perm_S (w[15], w[16], selector); w[27] = __byte_perm_S (w[14], w[15], selector); w[26] = __byte_perm_S (w[13], w[14], selector); w[25] = __byte_perm_S (w[12], w[13], selector); w[24] = __byte_perm_S (w[11], w[12], selector); w[23] = __byte_perm_S (w[10], w[11], selector); w[22] = __byte_perm_S (w[ 9], w[10], selector); w[21] = __byte_perm_S (w[ 8], w[ 9], selector); w[20] = __byte_perm_S (w[ 7], w[ 8], selector); w[19] = __byte_perm_S (w[ 6], w[ 7], selector); w[18] = __byte_perm_S (w[ 5], w[ 6], selector); w[17] = __byte_perm_S (w[ 4], w[ 5], selector); w[16] = __byte_perm_S (w[ 3], w[ 4], selector); w[15] = __byte_perm_S (w[ 2], w[ 3], selector); w[14] = __byte_perm_S (w[ 1], w[ 2], selector); w[13] = __byte_perm_S (w[ 0], w[ 1], selector); w[12] = __byte_perm_S ( 0, w[ 0], selector); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 
0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = __byte_perm_S (w[49], w[50], selector); w[62] = __byte_perm_S (w[48], w[49], selector); w[61] = __byte_perm_S (w[47], w[48], selector); w[60] = __byte_perm_S (w[46], w[47], selector); w[59] = __byte_perm_S (w[45], w[46], selector); w[58] = __byte_perm_S (w[44], w[45], selector); w[57] = __byte_perm_S (w[43], w[44], selector); w[56] = __byte_perm_S (w[42], w[43], selector); w[55] = __byte_perm_S (w[41], w[42], selector); w[54] = __byte_perm_S (w[40], w[41], selector); w[53] = __byte_perm_S (w[39], w[40], selector); w[52] = __byte_perm_S (w[38], w[39], selector); w[51] = __byte_perm_S (w[37], w[38], selector); w[50] = __byte_perm_S (w[36], w[37], selector); w[49] = __byte_perm_S (w[35], w[36], selector); w[48] = __byte_perm_S (w[34], w[35], selector); w[47] = __byte_perm_S (w[33], w[34], selector); w[46] = __byte_perm_S (w[32], w[33], selector); w[45] = __byte_perm_S (w[31], w[32], selector); w[44] = __byte_perm_S (w[30], w[31], selector); w[43] = __byte_perm_S (w[29], w[30], selector); w[42] = __byte_perm_S (w[28], w[29], selector); w[41] = __byte_perm_S (w[27], w[28], selector); w[40] = __byte_perm_S (w[26], w[27], selector); w[39] = __byte_perm_S (w[25], w[26], selector); w[38] = __byte_perm_S (w[24], w[25], selector); w[37] = __byte_perm_S (w[23], w[24], selector); w[36] = __byte_perm_S (w[22], w[23], selector); w[35] = __byte_perm_S (w[21], w[22], selector); w[34] = __byte_perm_S (w[20], w[21], selector); w[33] = __byte_perm_S (w[19], w[20], selector); w[32] = __byte_perm_S (w[18], w[19], selector); w[31] = __byte_perm_S (w[17], w[18], selector); w[30] = __byte_perm_S (w[16], w[17], selector); w[29] = __byte_perm_S (w[15], w[16], selector); w[28] = __byte_perm_S (w[14], w[15], selector); w[27] = __byte_perm_S (w[13], w[14], selector); w[26] = __byte_perm_S (w[12], w[13], selector); w[25] = __byte_perm_S (w[11], w[12], selector); w[24] = 
__byte_perm_S (w[10], w[11], selector); w[23] = __byte_perm_S (w[ 9], w[10], selector); w[22] = __byte_perm_S (w[ 8], w[ 9], selector); w[21] = __byte_perm_S (w[ 7], w[ 8], selector); w[20] = __byte_perm_S (w[ 6], w[ 7], selector); w[19] = __byte_perm_S (w[ 5], w[ 6], selector); w[18] = __byte_perm_S (w[ 4], w[ 5], selector); w[17] = __byte_perm_S (w[ 3], w[ 4], selector); w[16] = __byte_perm_S (w[ 2], w[ 3], selector); w[15] = __byte_perm_S (w[ 1], w[ 2], selector); w[14] = __byte_perm_S (w[ 0], w[ 1], selector); w[13] = __byte_perm_S ( 0, w[ 0], selector); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 14: w[63] = __byte_perm_S (w[48], w[49], selector); w[62] = __byte_perm_S (w[47], w[48], selector); w[61] = __byte_perm_S (w[46], w[47], selector); w[60] = __byte_perm_S (w[45], w[46], selector); w[59] = __byte_perm_S (w[44], w[45], selector); w[58] = __byte_perm_S (w[43], w[44], selector); w[57] = __byte_perm_S (w[42], w[43], selector); w[56] = __byte_perm_S (w[41], w[42], selector); w[55] = __byte_perm_S (w[40], w[41], selector); w[54] = __byte_perm_S (w[39], w[40], selector); w[53] = __byte_perm_S (w[38], w[39], selector); w[52] = __byte_perm_S (w[37], w[38], selector); w[51] = __byte_perm_S (w[36], w[37], selector); w[50] = __byte_perm_S (w[35], w[36], selector); w[49] = __byte_perm_S (w[34], w[35], selector); w[48] = __byte_perm_S (w[33], w[34], selector); w[47] = __byte_perm_S (w[32], w[33], selector); w[46] = __byte_perm_S (w[31], w[32], selector); w[45] = __byte_perm_S (w[30], w[31], selector); w[44] = __byte_perm_S (w[29], w[30], selector); w[43] = __byte_perm_S (w[28], w[29], selector); w[42] = __byte_perm_S (w[27], w[28], selector); w[41] = __byte_perm_S (w[26], w[27], selector); w[40] = __byte_perm_S (w[25], w[26], selector); w[39] = __byte_perm_S (w[24], w[25], selector); w[38] = __byte_perm_S (w[23], w[24], selector); w[37] = __byte_perm_S 
(w[22], w[23], selector); w[36] = __byte_perm_S (w[21], w[22], selector); w[35] = __byte_perm_S (w[20], w[21], selector); w[34] = __byte_perm_S (w[19], w[20], selector); w[33] = __byte_perm_S (w[18], w[19], selector); w[32] = __byte_perm_S (w[17], w[18], selector); w[31] = __byte_perm_S (w[16], w[17], selector); w[30] = __byte_perm_S (w[15], w[16], selector); w[29] = __byte_perm_S (w[14], w[15], selector); w[28] = __byte_perm_S (w[13], w[14], selector); w[27] = __byte_perm_S (w[12], w[13], selector); w[26] = __byte_perm_S (w[11], w[12], selector); w[25] = __byte_perm_S (w[10], w[11], selector); w[24] = __byte_perm_S (w[ 9], w[10], selector); w[23] = __byte_perm_S (w[ 8], w[ 9], selector); w[22] = __byte_perm_S (w[ 7], w[ 8], selector); w[21] = __byte_perm_S (w[ 6], w[ 7], selector); w[20] = __byte_perm_S (w[ 5], w[ 6], selector); w[19] = __byte_perm_S (w[ 4], w[ 5], selector); w[18] = __byte_perm_S (w[ 3], w[ 4], selector); w[17] = __byte_perm_S (w[ 2], w[ 3], selector); w[16] = __byte_perm_S (w[ 1], w[ 2], selector); w[15] = __byte_perm_S (w[ 0], w[ 1], selector); w[14] = __byte_perm_S ( 0, w[ 0], selector); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = __byte_perm_S (w[47], w[48], selector); w[62] = __byte_perm_S (w[46], w[47], selector); w[61] = __byte_perm_S (w[45], w[46], selector); w[60] = __byte_perm_S (w[44], w[45], selector); w[59] = __byte_perm_S (w[43], w[44], selector); w[58] = __byte_perm_S (w[42], w[43], selector); w[57] = __byte_perm_S (w[41], w[42], selector); w[56] = __byte_perm_S (w[40], w[41], selector); w[55] = __byte_perm_S (w[39], w[40], selector); w[54] = __byte_perm_S (w[38], w[39], selector); w[53] = __byte_perm_S (w[37], w[38], selector); w[52] = __byte_perm_S (w[36], w[37], selector); w[51] = __byte_perm_S (w[35], w[36], selector); w[50] = __byte_perm_S (w[34], w[35], selector); w[49] = __byte_perm_S (w[33], 
w[34], selector); w[48] = __byte_perm_S (w[32], w[33], selector); w[47] = __byte_perm_S (w[31], w[32], selector); w[46] = __byte_perm_S (w[30], w[31], selector); w[45] = __byte_perm_S (w[29], w[30], selector); w[44] = __byte_perm_S (w[28], w[29], selector); w[43] = __byte_perm_S (w[27], w[28], selector); w[42] = __byte_perm_S (w[26], w[27], selector); w[41] = __byte_perm_S (w[25], w[26], selector); w[40] = __byte_perm_S (w[24], w[25], selector); w[39] = __byte_perm_S (w[23], w[24], selector); w[38] = __byte_perm_S (w[22], w[23], selector); w[37] = __byte_perm_S (w[21], w[22], selector); w[36] = __byte_perm_S (w[20], w[21], selector); w[35] = __byte_perm_S (w[19], w[20], selector); w[34] = __byte_perm_S (w[18], w[19], selector); w[33] = __byte_perm_S (w[17], w[18], selector); w[32] = __byte_perm_S (w[16], w[17], selector); w[31] = __byte_perm_S (w[15], w[16], selector); w[30] = __byte_perm_S (w[14], w[15], selector); w[29] = __byte_perm_S (w[13], w[14], selector); w[28] = __byte_perm_S (w[12], w[13], selector); w[27] = __byte_perm_S (w[11], w[12], selector); w[26] = __byte_perm_S (w[10], w[11], selector); w[25] = __byte_perm_S (w[ 9], w[10], selector); w[24] = __byte_perm_S (w[ 8], w[ 9], selector); w[23] = __byte_perm_S (w[ 7], w[ 8], selector); w[22] = __byte_perm_S (w[ 6], w[ 7], selector); w[21] = __byte_perm_S (w[ 5], w[ 6], selector); w[20] = __byte_perm_S (w[ 4], w[ 5], selector); w[19] = __byte_perm_S (w[ 3], w[ 4], selector); w[18] = __byte_perm_S (w[ 2], w[ 3], selector); w[17] = __byte_perm_S (w[ 1], w[ 2], selector); w[16] = __byte_perm_S (w[ 0], w[ 1], selector); w[15] = __byte_perm_S ( 0, w[ 0], selector); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = __byte_perm_S (w[46], w[47], selector); w[62] = __byte_perm_S (w[45], w[46], selector); w[61] = __byte_perm_S (w[44], w[45], selector); w[60] = __byte_perm_S 
(w[43], w[44], selector); w[59] = __byte_perm_S (w[42], w[43], selector); w[58] = __byte_perm_S (w[41], w[42], selector); w[57] = __byte_perm_S (w[40], w[41], selector); w[56] = __byte_perm_S (w[39], w[40], selector); w[55] = __byte_perm_S (w[38], w[39], selector); w[54] = __byte_perm_S (w[37], w[38], selector); w[53] = __byte_perm_S (w[36], w[37], selector); w[52] = __byte_perm_S (w[35], w[36], selector); w[51] = __byte_perm_S (w[34], w[35], selector); w[50] = __byte_perm_S (w[33], w[34], selector); w[49] = __byte_perm_S (w[32], w[33], selector); w[48] = __byte_perm_S (w[31], w[32], selector); w[47] = __byte_perm_S (w[30], w[31], selector); w[46] = __byte_perm_S (w[29], w[30], selector); w[45] = __byte_perm_S (w[28], w[29], selector); w[44] = __byte_perm_S (w[27], w[28], selector); w[43] = __byte_perm_S (w[26], w[27], selector); w[42] = __byte_perm_S (w[25], w[26], selector); w[41] = __byte_perm_S (w[24], w[25], selector); w[40] = __byte_perm_S (w[23], w[24], selector); w[39] = __byte_perm_S (w[22], w[23], selector); w[38] = __byte_perm_S (w[21], w[22], selector); w[37] = __byte_perm_S (w[20], w[21], selector); w[36] = __byte_perm_S (w[19], w[20], selector); w[35] = __byte_perm_S (w[18], w[19], selector); w[34] = __byte_perm_S (w[17], w[18], selector); w[33] = __byte_perm_S (w[16], w[17], selector); w[32] = __byte_perm_S (w[15], w[16], selector); w[31] = __byte_perm_S (w[14], w[15], selector); w[30] = __byte_perm_S (w[13], w[14], selector); w[29] = __byte_perm_S (w[12], w[13], selector); w[28] = __byte_perm_S (w[11], w[12], selector); w[27] = __byte_perm_S (w[10], w[11], selector); w[26] = __byte_perm_S (w[ 9], w[10], selector); w[25] = __byte_perm_S (w[ 8], w[ 9], selector); w[24] = __byte_perm_S (w[ 7], w[ 8], selector); w[23] = __byte_perm_S (w[ 6], w[ 7], selector); w[22] = __byte_perm_S (w[ 5], w[ 6], selector); w[21] = __byte_perm_S (w[ 4], w[ 5], selector); w[20] = __byte_perm_S (w[ 3], w[ 4], selector); w[19] = __byte_perm_S (w[ 2], w[ 3], selector); w[18] 
= __byte_perm_S (w[ 1], w[ 2], selector); w[17] = __byte_perm_S (w[ 0], w[ 1], selector); w[16] = __byte_perm_S ( 0, w[ 0], selector); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = __byte_perm_S (w[45], w[46], selector); w[62] = __byte_perm_S (w[44], w[45], selector); w[61] = __byte_perm_S (w[43], w[44], selector); w[60] = __byte_perm_S (w[42], w[43], selector); w[59] = __byte_perm_S (w[41], w[42], selector); w[58] = __byte_perm_S (w[40], w[41], selector); w[57] = __byte_perm_S (w[39], w[40], selector); w[56] = __byte_perm_S (w[38], w[39], selector); w[55] = __byte_perm_S (w[37], w[38], selector); w[54] = __byte_perm_S (w[36], w[37], selector); w[53] = __byte_perm_S (w[35], w[36], selector); w[52] = __byte_perm_S (w[34], w[35], selector); w[51] = __byte_perm_S (w[33], w[34], selector); w[50] = __byte_perm_S (w[32], w[33], selector); w[49] = __byte_perm_S (w[31], w[32], selector); w[48] = __byte_perm_S (w[30], w[31], selector); w[47] = __byte_perm_S (w[29], w[30], selector); w[46] = __byte_perm_S (w[28], w[29], selector); w[45] = __byte_perm_S (w[27], w[28], selector); w[44] = __byte_perm_S (w[26], w[27], selector); w[43] = __byte_perm_S (w[25], w[26], selector); w[42] = __byte_perm_S (w[24], w[25], selector); w[41] = __byte_perm_S (w[23], w[24], selector); w[40] = __byte_perm_S (w[22], w[23], selector); w[39] = __byte_perm_S (w[21], w[22], selector); w[38] = __byte_perm_S (w[20], w[21], selector); w[37] = __byte_perm_S (w[19], w[20], selector); w[36] = __byte_perm_S (w[18], w[19], selector); w[35] = __byte_perm_S (w[17], w[18], selector); w[34] = __byte_perm_S (w[16], w[17], selector); w[33] = __byte_perm_S (w[15], w[16], selector); w[32] = __byte_perm_S (w[14], w[15], selector); w[31] = __byte_perm_S (w[13], w[14], selector); w[30] = __byte_perm_S (w[12], w[13], selector); w[29] = __byte_perm_S (w[11], w[12], 
selector); w[28] = __byte_perm_S (w[10], w[11], selector); w[27] = __byte_perm_S (w[ 9], w[10], selector); w[26] = __byte_perm_S (w[ 8], w[ 9], selector); w[25] = __byte_perm_S (w[ 7], w[ 8], selector); w[24] = __byte_perm_S (w[ 6], w[ 7], selector); w[23] = __byte_perm_S (w[ 5], w[ 6], selector); w[22] = __byte_perm_S (w[ 4], w[ 5], selector); w[21] = __byte_perm_S (w[ 3], w[ 4], selector); w[20] = __byte_perm_S (w[ 2], w[ 3], selector); w[19] = __byte_perm_S (w[ 1], w[ 2], selector); w[18] = __byte_perm_S (w[ 0], w[ 1], selector); w[17] = __byte_perm_S ( 0, w[ 0], selector); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = __byte_perm_S (w[44], w[45], selector); w[62] = __byte_perm_S (w[43], w[44], selector); w[61] = __byte_perm_S (w[42], w[43], selector); w[60] = __byte_perm_S (w[41], w[42], selector); w[59] = __byte_perm_S (w[40], w[41], selector); w[58] = __byte_perm_S (w[39], w[40], selector); w[57] = __byte_perm_S (w[38], w[39], selector); w[56] = __byte_perm_S (w[37], w[38], selector); w[55] = __byte_perm_S (w[36], w[37], selector); w[54] = __byte_perm_S (w[35], w[36], selector); w[53] = __byte_perm_S (w[34], w[35], selector); w[52] = __byte_perm_S (w[33], w[34], selector); w[51] = __byte_perm_S (w[32], w[33], selector); w[50] = __byte_perm_S (w[31], w[32], selector); w[49] = __byte_perm_S (w[30], w[31], selector); w[48] = __byte_perm_S (w[29], w[30], selector); w[47] = __byte_perm_S (w[28], w[29], selector); w[46] = __byte_perm_S (w[27], w[28], selector); w[45] = __byte_perm_S (w[26], w[27], selector); w[44] = __byte_perm_S (w[25], w[26], selector); w[43] = __byte_perm_S (w[24], w[25], selector); w[42] = __byte_perm_S (w[23], w[24], selector); w[41] = __byte_perm_S (w[22], w[23], selector); w[40] = __byte_perm_S (w[21], w[22], selector); w[39] = __byte_perm_S (w[20], w[21], selector); w[38] = 
__byte_perm_S (w[19], w[20], selector); w[37] = __byte_perm_S (w[18], w[19], selector); w[36] = __byte_perm_S (w[17], w[18], selector); w[35] = __byte_perm_S (w[16], w[17], selector); w[34] = __byte_perm_S (w[15], w[16], selector); w[33] = __byte_perm_S (w[14], w[15], selector); w[32] = __byte_perm_S (w[13], w[14], selector); w[31] = __byte_perm_S (w[12], w[13], selector); w[30] = __byte_perm_S (w[11], w[12], selector); w[29] = __byte_perm_S (w[10], w[11], selector); w[28] = __byte_perm_S (w[ 9], w[10], selector); w[27] = __byte_perm_S (w[ 8], w[ 9], selector); w[26] = __byte_perm_S (w[ 7], w[ 8], selector); w[25] = __byte_perm_S (w[ 6], w[ 7], selector); w[24] = __byte_perm_S (w[ 5], w[ 6], selector); w[23] = __byte_perm_S (w[ 4], w[ 5], selector); w[22] = __byte_perm_S (w[ 3], w[ 4], selector); w[21] = __byte_perm_S (w[ 2], w[ 3], selector); w[20] = __byte_perm_S (w[ 1], w[ 2], selector); w[19] = __byte_perm_S (w[ 0], w[ 1], selector); w[18] = __byte_perm_S ( 0, w[ 0], selector); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = __byte_perm_S (w[43], w[44], selector); w[62] = __byte_perm_S (w[42], w[43], selector); w[61] = __byte_perm_S (w[41], w[42], selector); w[60] = __byte_perm_S (w[40], w[41], selector); w[59] = __byte_perm_S (w[39], w[40], selector); w[58] = __byte_perm_S (w[38], w[39], selector); w[57] = __byte_perm_S (w[37], w[38], selector); w[56] = __byte_perm_S (w[36], w[37], selector); w[55] = __byte_perm_S (w[35], w[36], selector); w[54] = __byte_perm_S (w[34], w[35], selector); w[53] = __byte_perm_S (w[33], w[34], selector); w[52] = __byte_perm_S (w[32], w[33], selector); w[51] = __byte_perm_S (w[31], w[32], selector); w[50] = __byte_perm_S (w[30], w[31], selector); w[49] = __byte_perm_S (w[29], w[30], selector); w[48] = __byte_perm_S (w[28], w[29], selector); w[47] = 
__byte_perm_S (w[27], w[28], selector); w[46] = __byte_perm_S (w[26], w[27], selector); w[45] = __byte_perm_S (w[25], w[26], selector); w[44] = __byte_perm_S (w[24], w[25], selector); w[43] = __byte_perm_S (w[23], w[24], selector); w[42] = __byte_perm_S (w[22], w[23], selector); w[41] = __byte_perm_S (w[21], w[22], selector); w[40] = __byte_perm_S (w[20], w[21], selector); w[39] = __byte_perm_S (w[19], w[20], selector); w[38] = __byte_perm_S (w[18], w[19], selector); w[37] = __byte_perm_S (w[17], w[18], selector); w[36] = __byte_perm_S (w[16], w[17], selector); w[35] = __byte_perm_S (w[15], w[16], selector); w[34] = __byte_perm_S (w[14], w[15], selector); w[33] = __byte_perm_S (w[13], w[14], selector); w[32] = __byte_perm_S (w[12], w[13], selector); w[31] = __byte_perm_S (w[11], w[12], selector); w[30] = __byte_perm_S (w[10], w[11], selector); w[29] = __byte_perm_S (w[ 9], w[10], selector); w[28] = __byte_perm_S (w[ 8], w[ 9], selector); w[27] = __byte_perm_S (w[ 7], w[ 8], selector); w[26] = __byte_perm_S (w[ 6], w[ 7], selector); w[25] = __byte_perm_S (w[ 5], w[ 6], selector); w[24] = __byte_perm_S (w[ 4], w[ 5], selector); w[23] = __byte_perm_S (w[ 3], w[ 4], selector); w[22] = __byte_perm_S (w[ 2], w[ 3], selector); w[21] = __byte_perm_S (w[ 1], w[ 2], selector); w[20] = __byte_perm_S (w[ 0], w[ 1], selector); w[19] = __byte_perm_S ( 0, w[ 0], selector); w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = __byte_perm_S (w[42], w[43], selector); w[62] = __byte_perm_S (w[41], w[42], selector); w[61] = __byte_perm_S (w[40], w[41], selector); w[60] = __byte_perm_S (w[39], w[40], selector); w[59] = __byte_perm_S (w[38], w[39], selector); w[58] = __byte_perm_S (w[37], w[38], selector); w[57] = __byte_perm_S (w[36], w[37], selector); w[56] = __byte_perm_S (w[35], w[36], selector); w[55] = 
__byte_perm_S (w[34], w[35], selector); w[54] = __byte_perm_S (w[33], w[34], selector); w[53] = __byte_perm_S (w[32], w[33], selector); w[52] = __byte_perm_S (w[31], w[32], selector); w[51] = __byte_perm_S (w[30], w[31], selector); w[50] = __byte_perm_S (w[29], w[30], selector); w[49] = __byte_perm_S (w[28], w[29], selector); w[48] = __byte_perm_S (w[27], w[28], selector); w[47] = __byte_perm_S (w[26], w[27], selector); w[46] = __byte_perm_S (w[25], w[26], selector); w[45] = __byte_perm_S (w[24], w[25], selector); w[44] = __byte_perm_S (w[23], w[24], selector); w[43] = __byte_perm_S (w[22], w[23], selector); w[42] = __byte_perm_S (w[21], w[22], selector); w[41] = __byte_perm_S (w[20], w[21], selector); w[40] = __byte_perm_S (w[19], w[20], selector); w[39] = __byte_perm_S (w[18], w[19], selector); w[38] = __byte_perm_S (w[17], w[18], selector); w[37] = __byte_perm_S (w[16], w[17], selector); w[36] = __byte_perm_S (w[15], w[16], selector); w[35] = __byte_perm_S (w[14], w[15], selector); w[34] = __byte_perm_S (w[13], w[14], selector); w[33] = __byte_perm_S (w[12], w[13], selector); w[32] = __byte_perm_S (w[11], w[12], selector); w[31] = __byte_perm_S (w[10], w[11], selector); w[30] = __byte_perm_S (w[ 9], w[10], selector); w[29] = __byte_perm_S (w[ 8], w[ 9], selector); w[28] = __byte_perm_S (w[ 7], w[ 8], selector); w[27] = __byte_perm_S (w[ 6], w[ 7], selector); w[26] = __byte_perm_S (w[ 5], w[ 6], selector); w[25] = __byte_perm_S (w[ 4], w[ 5], selector); w[24] = __byte_perm_S (w[ 3], w[ 4], selector); w[23] = __byte_perm_S (w[ 2], w[ 3], selector); w[22] = __byte_perm_S (w[ 1], w[ 2], selector); w[21] = __byte_perm_S (w[ 0], w[ 1], selector); w[20] = __byte_perm_S ( 0, w[ 0], selector); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = __byte_perm_S (w[41], w[42], 
selector); w[62] = __byte_perm_S (w[40], w[41], selector); w[61] = __byte_perm_S (w[39], w[40], selector); w[60] = __byte_perm_S (w[38], w[39], selector); w[59] = __byte_perm_S (w[37], w[38], selector); w[58] = __byte_perm_S (w[36], w[37], selector); w[57] = __byte_perm_S (w[35], w[36], selector); w[56] = __byte_perm_S (w[34], w[35], selector); w[55] = __byte_perm_S (w[33], w[34], selector); w[54] = __byte_perm_S (w[32], w[33], selector); w[53] = __byte_perm_S (w[31], w[32], selector); w[52] = __byte_perm_S (w[30], w[31], selector); w[51] = __byte_perm_S (w[29], w[30], selector); w[50] = __byte_perm_S (w[28], w[29], selector); w[49] = __byte_perm_S (w[27], w[28], selector); w[48] = __byte_perm_S (w[26], w[27], selector); w[47] = __byte_perm_S (w[25], w[26], selector); w[46] = __byte_perm_S (w[24], w[25], selector); w[45] = __byte_perm_S (w[23], w[24], selector); w[44] = __byte_perm_S (w[22], w[23], selector); w[43] = __byte_perm_S (w[21], w[22], selector); w[42] = __byte_perm_S (w[20], w[21], selector); w[41] = __byte_perm_S (w[19], w[20], selector); w[40] = __byte_perm_S (w[18], w[19], selector); w[39] = __byte_perm_S (w[17], w[18], selector); w[38] = __byte_perm_S (w[16], w[17], selector); w[37] = __byte_perm_S (w[15], w[16], selector); w[36] = __byte_perm_S (w[14], w[15], selector); w[35] = __byte_perm_S (w[13], w[14], selector); w[34] = __byte_perm_S (w[12], w[13], selector); w[33] = __byte_perm_S (w[11], w[12], selector); w[32] = __byte_perm_S (w[10], w[11], selector); w[31] = __byte_perm_S (w[ 9], w[10], selector); w[30] = __byte_perm_S (w[ 8], w[ 9], selector); w[29] = __byte_perm_S (w[ 7], w[ 8], selector); w[28] = __byte_perm_S (w[ 6], w[ 7], selector); w[27] = __byte_perm_S (w[ 5], w[ 6], selector); w[26] = __byte_perm_S (w[ 4], w[ 5], selector); w[25] = __byte_perm_S (w[ 3], w[ 4], selector); w[24] = __byte_perm_S (w[ 2], w[ 3], selector); w[23] = __byte_perm_S (w[ 1], w[ 2], selector); w[22] = __byte_perm_S (w[ 0], w[ 1], selector); w[21] = 
__byte_perm_S ( 0, w[ 0], selector); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = __byte_perm_S (w[40], w[41], selector); w[62] = __byte_perm_S (w[39], w[40], selector); w[61] = __byte_perm_S (w[38], w[39], selector); w[60] = __byte_perm_S (w[37], w[38], selector); w[59] = __byte_perm_S (w[36], w[37], selector); w[58] = __byte_perm_S (w[35], w[36], selector); w[57] = __byte_perm_S (w[34], w[35], selector); w[56] = __byte_perm_S (w[33], w[34], selector); w[55] = __byte_perm_S (w[32], w[33], selector); w[54] = __byte_perm_S (w[31], w[32], selector); w[53] = __byte_perm_S (w[30], w[31], selector); w[52] = __byte_perm_S (w[29], w[30], selector); w[51] = __byte_perm_S (w[28], w[29], selector); w[50] = __byte_perm_S (w[27], w[28], selector); w[49] = __byte_perm_S (w[26], w[27], selector); w[48] = __byte_perm_S (w[25], w[26], selector); w[47] = __byte_perm_S (w[24], w[25], selector); w[46] = __byte_perm_S (w[23], w[24], selector); w[45] = __byte_perm_S (w[22], w[23], selector); w[44] = __byte_perm_S (w[21], w[22], selector); w[43] = __byte_perm_S (w[20], w[21], selector); w[42] = __byte_perm_S (w[19], w[20], selector); w[41] = __byte_perm_S (w[18], w[19], selector); w[40] = __byte_perm_S (w[17], w[18], selector); w[39] = __byte_perm_S (w[16], w[17], selector); w[38] = __byte_perm_S (w[15], w[16], selector); w[37] = __byte_perm_S (w[14], w[15], selector); w[36] = __byte_perm_S (w[13], w[14], selector); w[35] = __byte_perm_S (w[12], w[13], selector); w[34] = __byte_perm_S (w[11], w[12], selector); w[33] = __byte_perm_S (w[10], w[11], selector); w[32] = __byte_perm_S (w[ 9], w[10], selector); w[31] = __byte_perm_S (w[ 8], w[ 9], selector); w[30] = __byte_perm_S (w[ 7], w[ 8], selector); w[29] = __byte_perm_S (w[ 6], w[ 7], selector); w[28] = __byte_perm_S (w[ 5], w[ 
6], selector); w[27] = __byte_perm_S (w[ 4], w[ 5], selector); w[26] = __byte_perm_S (w[ 3], w[ 4], selector); w[25] = __byte_perm_S (w[ 2], w[ 3], selector); w[24] = __byte_perm_S (w[ 1], w[ 2], selector); w[23] = __byte_perm_S (w[ 0], w[ 1], selector); w[22] = __byte_perm_S ( 0, w[ 0], selector); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = __byte_perm_S (w[39], w[40], selector); w[62] = __byte_perm_S (w[38], w[39], selector); w[61] = __byte_perm_S (w[37], w[38], selector); w[60] = __byte_perm_S (w[36], w[37], selector); w[59] = __byte_perm_S (w[35], w[36], selector); w[58] = __byte_perm_S (w[34], w[35], selector); w[57] = __byte_perm_S (w[33], w[34], selector); w[56] = __byte_perm_S (w[32], w[33], selector); w[55] = __byte_perm_S (w[31], w[32], selector); w[54] = __byte_perm_S (w[30], w[31], selector); w[53] = __byte_perm_S (w[29], w[30], selector); w[52] = __byte_perm_S (w[28], w[29], selector); w[51] = __byte_perm_S (w[27], w[28], selector); w[50] = __byte_perm_S (w[26], w[27], selector); w[49] = __byte_perm_S (w[25], w[26], selector); w[48] = __byte_perm_S (w[24], w[25], selector); w[47] = __byte_perm_S (w[23], w[24], selector); w[46] = __byte_perm_S (w[22], w[23], selector); w[45] = __byte_perm_S (w[21], w[22], selector); w[44] = __byte_perm_S (w[20], w[21], selector); w[43] = __byte_perm_S (w[19], w[20], selector); w[42] = __byte_perm_S (w[18], w[19], selector); w[41] = __byte_perm_S (w[17], w[18], selector); w[40] = __byte_perm_S (w[16], w[17], selector); w[39] = __byte_perm_S (w[15], w[16], selector); w[38] = __byte_perm_S (w[14], w[15], selector); w[37] = __byte_perm_S (w[13], w[14], selector); w[36] = __byte_perm_S (w[12], w[13], selector); w[35] = __byte_perm_S (w[11], w[12], selector); w[34] = __byte_perm_S (w[10], w[11], selector); 
w[33] = __byte_perm_S (w[ 9], w[10], selector); w[32] = __byte_perm_S (w[ 8], w[ 9], selector); w[31] = __byte_perm_S (w[ 7], w[ 8], selector); w[30] = __byte_perm_S (w[ 6], w[ 7], selector); w[29] = __byte_perm_S (w[ 5], w[ 6], selector); w[28] = __byte_perm_S (w[ 4], w[ 5], selector); w[27] = __byte_perm_S (w[ 3], w[ 4], selector); w[26] = __byte_perm_S (w[ 2], w[ 3], selector); w[25] = __byte_perm_S (w[ 1], w[ 2], selector); w[24] = __byte_perm_S (w[ 0], w[ 1], selector); w[23] = __byte_perm_S ( 0, w[ 0], selector); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = __byte_perm_S (w[38], w[39], selector); w[62] = __byte_perm_S (w[37], w[38], selector); w[61] = __byte_perm_S (w[36], w[37], selector); w[60] = __byte_perm_S (w[35], w[36], selector); w[59] = __byte_perm_S (w[34], w[35], selector); w[58] = __byte_perm_S (w[33], w[34], selector); w[57] = __byte_perm_S (w[32], w[33], selector); w[56] = __byte_perm_S (w[31], w[32], selector); w[55] = __byte_perm_S (w[30], w[31], selector); w[54] = __byte_perm_S (w[29], w[30], selector); w[53] = __byte_perm_S (w[28], w[29], selector); w[52] = __byte_perm_S (w[27], w[28], selector); w[51] = __byte_perm_S (w[26], w[27], selector); w[50] = __byte_perm_S (w[25], w[26], selector); w[49] = __byte_perm_S (w[24], w[25], selector); w[48] = __byte_perm_S (w[23], w[24], selector); w[47] = __byte_perm_S (w[22], w[23], selector); w[46] = __byte_perm_S (w[21], w[22], selector); w[45] = __byte_perm_S (w[20], w[21], selector); w[44] = __byte_perm_S (w[19], w[20], selector); w[43] = __byte_perm_S (w[18], w[19], selector); w[42] = __byte_perm_S (w[17], w[18], selector); w[41] = __byte_perm_S (w[16], w[17], selector); w[40] = __byte_perm_S (w[15], w[16], selector); w[39] = __byte_perm_S (w[14], w[15], selector); w[38] 
= __byte_perm_S (w[13], w[14], selector); w[37] = __byte_perm_S (w[12], w[13], selector); w[36] = __byte_perm_S (w[11], w[12], selector); w[35] = __byte_perm_S (w[10], w[11], selector); w[34] = __byte_perm_S (w[ 9], w[10], selector); w[33] = __byte_perm_S (w[ 8], w[ 9], selector); w[32] = __byte_perm_S (w[ 7], w[ 8], selector); w[31] = __byte_perm_S (w[ 6], w[ 7], selector); w[30] = __byte_perm_S (w[ 5], w[ 6], selector); w[29] = __byte_perm_S (w[ 4], w[ 5], selector); w[28] = __byte_perm_S (w[ 3], w[ 4], selector); w[27] = __byte_perm_S (w[ 2], w[ 3], selector); w[26] = __byte_perm_S (w[ 1], w[ 2], selector); w[25] = __byte_perm_S (w[ 0], w[ 1], selector); w[24] = __byte_perm_S ( 0, w[ 0], selector); w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = __byte_perm_S (w[37], w[38], selector); w[62] = __byte_perm_S (w[36], w[37], selector); w[61] = __byte_perm_S (w[35], w[36], selector); w[60] = __byte_perm_S (w[34], w[35], selector); w[59] = __byte_perm_S (w[33], w[34], selector); w[58] = __byte_perm_S (w[32], w[33], selector); w[57] = __byte_perm_S (w[31], w[32], selector); w[56] = __byte_perm_S (w[30], w[31], selector); w[55] = __byte_perm_S (w[29], w[30], selector); w[54] = __byte_perm_S (w[28], w[29], selector); w[53] = __byte_perm_S (w[27], w[28], selector); w[52] = __byte_perm_S (w[26], w[27], selector); w[51] = __byte_perm_S (w[25], w[26], selector); w[50] = __byte_perm_S (w[24], w[25], selector); w[49] = __byte_perm_S (w[23], w[24], selector); w[48] = __byte_perm_S (w[22], w[23], selector); w[47] = __byte_perm_S (w[21], w[22], selector); w[46] = __byte_perm_S (w[20], w[21], selector); w[45] = __byte_perm_S (w[19], w[20], selector); w[44] = __byte_perm_S (w[18], w[19], selector); w[43] = __byte_perm_S (w[17], w[18], selector); 
w[42] = __byte_perm_S (w[16], w[17], selector); w[41] = __byte_perm_S (w[15], w[16], selector); w[40] = __byte_perm_S (w[14], w[15], selector); w[39] = __byte_perm_S (w[13], w[14], selector); w[38] = __byte_perm_S (w[12], w[13], selector); w[37] = __byte_perm_S (w[11], w[12], selector); w[36] = __byte_perm_S (w[10], w[11], selector); w[35] = __byte_perm_S (w[ 9], w[10], selector); w[34] = __byte_perm_S (w[ 8], w[ 9], selector); w[33] = __byte_perm_S (w[ 7], w[ 8], selector); w[32] = __byte_perm_S (w[ 6], w[ 7], selector); w[31] = __byte_perm_S (w[ 5], w[ 6], selector); w[30] = __byte_perm_S (w[ 4], w[ 5], selector); w[29] = __byte_perm_S (w[ 3], w[ 4], selector); w[28] = __byte_perm_S (w[ 2], w[ 3], selector); w[27] = __byte_perm_S (w[ 1], w[ 2], selector); w[26] = __byte_perm_S (w[ 0], w[ 1], selector); w[25] = __byte_perm_S ( 0, w[ 0], selector); w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = __byte_perm_S (w[36], w[37], selector); w[62] = __byte_perm_S (w[35], w[36], selector); w[61] = __byte_perm_S (w[34], w[35], selector); w[60] = __byte_perm_S (w[33], w[34], selector); w[59] = __byte_perm_S (w[32], w[33], selector); w[58] = __byte_perm_S (w[31], w[32], selector); w[57] = __byte_perm_S (w[30], w[31], selector); w[56] = __byte_perm_S (w[29], w[30], selector); w[55] = __byte_perm_S (w[28], w[29], selector); w[54] = __byte_perm_S (w[27], w[28], selector); w[53] = __byte_perm_S (w[26], w[27], selector); w[52] = __byte_perm_S (w[25], w[26], selector); w[51] = __byte_perm_S (w[24], w[25], selector); w[50] = __byte_perm_S (w[23], w[24], selector); w[49] = __byte_perm_S (w[22], w[23], selector); w[48] = __byte_perm_S (w[21], w[22], selector); w[47] = __byte_perm_S (w[20], w[21], selector); w[46] = __byte_perm_S (w[19], 
w[20], selector); w[45] = __byte_perm_S (w[18], w[19], selector); w[44] = __byte_perm_S (w[17], w[18], selector); w[43] = __byte_perm_S (w[16], w[17], selector); w[42] = __byte_perm_S (w[15], w[16], selector); w[41] = __byte_perm_S (w[14], w[15], selector); w[40] = __byte_perm_S (w[13], w[14], selector); w[39] = __byte_perm_S (w[12], w[13], selector); w[38] = __byte_perm_S (w[11], w[12], selector); w[37] = __byte_perm_S (w[10], w[11], selector); w[36] = __byte_perm_S (w[ 9], w[10], selector); w[35] = __byte_perm_S (w[ 8], w[ 9], selector); w[34] = __byte_perm_S (w[ 7], w[ 8], selector); w[33] = __byte_perm_S (w[ 6], w[ 7], selector); w[32] = __byte_perm_S (w[ 5], w[ 6], selector); w[31] = __byte_perm_S (w[ 4], w[ 5], selector); w[30] = __byte_perm_S (w[ 3], w[ 4], selector); w[29] = __byte_perm_S (w[ 2], w[ 3], selector); w[28] = __byte_perm_S (w[ 1], w[ 2], selector); w[27] = __byte_perm_S (w[ 0], w[ 1], selector); w[26] = __byte_perm_S ( 0, w[ 0], selector); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = __byte_perm_S (w[35], w[36], selector); w[62] = __byte_perm_S (w[34], w[35], selector); w[61] = __byte_perm_S (w[33], w[34], selector); w[60] = __byte_perm_S (w[32], w[33], selector); w[59] = __byte_perm_S (w[31], w[32], selector); w[58] = __byte_perm_S (w[30], w[31], selector); w[57] = __byte_perm_S (w[29], w[30], selector); w[56] = __byte_perm_S (w[28], w[29], selector); w[55] = __byte_perm_S (w[27], w[28], selector); w[54] = __byte_perm_S (w[26], w[27], selector); w[53] = __byte_perm_S (w[25], w[26], selector); w[52] = __byte_perm_S (w[24], w[25], selector); w[51] = __byte_perm_S (w[23], w[24], selector); w[50] = __byte_perm_S (w[22], w[23], selector); w[49] = __byte_perm_S (w[21], w[22], selector); 
w[48] = __byte_perm_S (w[20], w[21], selector); w[47] = __byte_perm_S (w[19], w[20], selector); w[46] = __byte_perm_S (w[18], w[19], selector); w[45] = __byte_perm_S (w[17], w[18], selector); w[44] = __byte_perm_S (w[16], w[17], selector); w[43] = __byte_perm_S (w[15], w[16], selector); w[42] = __byte_perm_S (w[14], w[15], selector); w[41] = __byte_perm_S (w[13], w[14], selector); w[40] = __byte_perm_S (w[12], w[13], selector); w[39] = __byte_perm_S (w[11], w[12], selector); w[38] = __byte_perm_S (w[10], w[11], selector); w[37] = __byte_perm_S (w[ 9], w[10], selector); w[36] = __byte_perm_S (w[ 8], w[ 9], selector); w[35] = __byte_perm_S (w[ 7], w[ 8], selector); w[34] = __byte_perm_S (w[ 6], w[ 7], selector); w[33] = __byte_perm_S (w[ 5], w[ 6], selector); w[32] = __byte_perm_S (w[ 4], w[ 5], selector); w[31] = __byte_perm_S (w[ 3], w[ 4], selector); w[30] = __byte_perm_S (w[ 2], w[ 3], selector); w[29] = __byte_perm_S (w[ 1], w[ 2], selector); w[28] = __byte_perm_S (w[ 0], w[ 1], selector); w[27] = __byte_perm_S ( 0, w[ 0], selector); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = __byte_perm_S (w[34], w[35], selector); w[62] = __byte_perm_S (w[33], w[34], selector); w[61] = __byte_perm_S (w[32], w[33], selector); w[60] = __byte_perm_S (w[31], w[32], selector); w[59] = __byte_perm_S (w[30], w[31], selector); w[58] = __byte_perm_S (w[29], w[30], selector); w[57] = __byte_perm_S (w[28], w[29], selector); w[56] = __byte_perm_S (w[27], w[28], selector); w[55] = __byte_perm_S (w[26], w[27], selector); w[54] = __byte_perm_S (w[25], w[26], selector); w[53] = __byte_perm_S (w[24], w[25], selector); w[52] = __byte_perm_S (w[23], w[24], selector); w[51] = __byte_perm_S (w[22], w[23], selector); w[50] = 
__byte_perm_S (w[21], w[22], selector); w[49] = __byte_perm_S (w[20], w[21], selector); w[48] = __byte_perm_S (w[19], w[20], selector); w[47] = __byte_perm_S (w[18], w[19], selector); w[46] = __byte_perm_S (w[17], w[18], selector); w[45] = __byte_perm_S (w[16], w[17], selector); w[44] = __byte_perm_S (w[15], w[16], selector); w[43] = __byte_perm_S (w[14], w[15], selector); w[42] = __byte_perm_S (w[13], w[14], selector); w[41] = __byte_perm_S (w[12], w[13], selector); w[40] = __byte_perm_S (w[11], w[12], selector); w[39] = __byte_perm_S (w[10], w[11], selector); w[38] = __byte_perm_S (w[ 9], w[10], selector); w[37] = __byte_perm_S (w[ 8], w[ 9], selector); w[36] = __byte_perm_S (w[ 7], w[ 8], selector); w[35] = __byte_perm_S (w[ 6], w[ 7], selector); w[34] = __byte_perm_S (w[ 5], w[ 6], selector); w[33] = __byte_perm_S (w[ 4], w[ 5], selector); w[32] = __byte_perm_S (w[ 3], w[ 4], selector); w[31] = __byte_perm_S (w[ 2], w[ 3], selector); w[30] = __byte_perm_S (w[ 1], w[ 2], selector); w[29] = __byte_perm_S (w[ 0], w[ 1], selector); w[28] = __byte_perm_S ( 0, w[ 0], selector); w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 29: w[63] = __byte_perm_S (w[33], w[34], selector); w[62] = __byte_perm_S (w[32], w[33], selector); w[61] = __byte_perm_S (w[31], w[32], selector); w[60] = __byte_perm_S (w[30], w[31], selector); w[59] = __byte_perm_S (w[29], w[30], selector); w[58] = __byte_perm_S (w[28], w[29], selector); w[57] = __byte_perm_S (w[27], w[28], selector); w[56] = __byte_perm_S (w[26], w[27], selector); w[55] = __byte_perm_S (w[25], w[26], selector); w[54] = __byte_perm_S (w[24], w[25], selector); w[53] = __byte_perm_S (w[23], w[24], selector); w[52] = __byte_perm_S (w[22], w[23], selector); w[51] 
= __byte_perm_S (w[21], w[22], selector); w[50] = __byte_perm_S (w[20], w[21], selector); w[49] = __byte_perm_S (w[19], w[20], selector); w[48] = __byte_perm_S (w[18], w[19], selector); w[47] = __byte_perm_S (w[17], w[18], selector); w[46] = __byte_perm_S (w[16], w[17], selector); w[45] = __byte_perm_S (w[15], w[16], selector); w[44] = __byte_perm_S (w[14], w[15], selector); w[43] = __byte_perm_S (w[13], w[14], selector); w[42] = __byte_perm_S (w[12], w[13], selector); w[41] = __byte_perm_S (w[11], w[12], selector); w[40] = __byte_perm_S (w[10], w[11], selector); w[39] = __byte_perm_S (w[ 9], w[10], selector); w[38] = __byte_perm_S (w[ 8], w[ 9], selector); w[37] = __byte_perm_S (w[ 7], w[ 8], selector); w[36] = __byte_perm_S (w[ 6], w[ 7], selector); w[35] = __byte_perm_S (w[ 5], w[ 6], selector); w[34] = __byte_perm_S (w[ 4], w[ 5], selector); w[33] = __byte_perm_S (w[ 3], w[ 4], selector); w[32] = __byte_perm_S (w[ 2], w[ 3], selector); w[31] = __byte_perm_S (w[ 1], w[ 2], selector); w[30] = __byte_perm_S (w[ 0], w[ 1], selector); w[29] = __byte_perm_S ( 0, w[ 0], selector); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 30: w[63] = __byte_perm_S (w[32], w[33], selector); w[62] = __byte_perm_S (w[31], w[32], selector); w[61] = __byte_perm_S (w[30], w[31], selector); w[60] = __byte_perm_S (w[29], w[30], selector); w[59] = __byte_perm_S (w[28], w[29], selector); w[58] = __byte_perm_S (w[27], w[28], selector); w[57] = __byte_perm_S (w[26], w[27], selector); w[56] = __byte_perm_S (w[25], w[26], selector); w[55] = __byte_perm_S (w[24], w[25], selector); w[54] = __byte_perm_S (w[23], w[24], selector); w[53] = __byte_perm_S (w[22], w[23], selector); w[52] = __byte_perm_S (w[21], w[22], 
selector); w[51] = __byte_perm_S (w[20], w[21], selector); w[50] = __byte_perm_S (w[19], w[20], selector); w[49] = __byte_perm_S (w[18], w[19], selector); w[48] = __byte_perm_S (w[17], w[18], selector); w[47] = __byte_perm_S (w[16], w[17], selector); w[46] = __byte_perm_S (w[15], w[16], selector); w[45] = __byte_perm_S (w[14], w[15], selector); w[44] = __byte_perm_S (w[13], w[14], selector); w[43] = __byte_perm_S (w[12], w[13], selector); w[42] = __byte_perm_S (w[11], w[12], selector); w[41] = __byte_perm_S (w[10], w[11], selector); w[40] = __byte_perm_S (w[ 9], w[10], selector); w[39] = __byte_perm_S (w[ 8], w[ 9], selector); w[38] = __byte_perm_S (w[ 7], w[ 8], selector); w[37] = __byte_perm_S (w[ 6], w[ 7], selector); w[36] = __byte_perm_S (w[ 5], w[ 6], selector); w[35] = __byte_perm_S (w[ 4], w[ 5], selector); w[34] = __byte_perm_S (w[ 3], w[ 4], selector); w[33] = __byte_perm_S (w[ 2], w[ 3], selector); w[32] = __byte_perm_S (w[ 1], w[ 2], selector); w[31] = __byte_perm_S (w[ 0], w[ 1], selector); w[30] = __byte_perm_S ( 0, w[ 0], selector); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 31: w[63] = __byte_perm_S (w[31], w[32], selector); w[62] = __byte_perm_S (w[30], w[31], selector); w[61] = __byte_perm_S (w[29], w[30], selector); w[60] = __byte_perm_S (w[28], w[29], selector); w[59] = __byte_perm_S (w[27], w[28], selector); w[58] = __byte_perm_S (w[26], w[27], selector); w[57] = __byte_perm_S (w[25], w[26], selector); w[56] = __byte_perm_S (w[24], w[25], selector); w[55] = __byte_perm_S (w[23], w[24], selector); w[54] = __byte_perm_S (w[22], w[23], selector); w[53] = __byte_perm_S (w[21], w[22], selector); w[52] = __byte_perm_S (w[20], w[21], selector); w[51] = 
__byte_perm_S (w[19], w[20], selector); w[50] = __byte_perm_S (w[18], w[19], selector); w[49] = __byte_perm_S (w[17], w[18], selector); w[48] = __byte_perm_S (w[16], w[17], selector); w[47] = __byte_perm_S (w[15], w[16], selector); w[46] = __byte_perm_S (w[14], w[15], selector); w[45] = __byte_perm_S (w[13], w[14], selector); w[44] = __byte_perm_S (w[12], w[13], selector); w[43] = __byte_perm_S (w[11], w[12], selector); w[42] = __byte_perm_S (w[10], w[11], selector); w[41] = __byte_perm_S (w[ 9], w[10], selector); w[40] = __byte_perm_S (w[ 8], w[ 9], selector); w[39] = __byte_perm_S (w[ 7], w[ 8], selector); w[38] = __byte_perm_S (w[ 6], w[ 7], selector); w[37] = __byte_perm_S (w[ 5], w[ 6], selector); w[36] = __byte_perm_S (w[ 4], w[ 5], selector); w[35] = __byte_perm_S (w[ 3], w[ 4], selector); w[34] = __byte_perm_S (w[ 2], w[ 3], selector); w[33] = __byte_perm_S (w[ 1], w[ 2], selector); w[32] = __byte_perm_S (w[ 0], w[ 1], selector); w[31] = __byte_perm_S ( 0, w[ 0], selector); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 32: w[63] = __byte_perm_S (w[30], w[31], selector); w[62] = __byte_perm_S (w[29], w[30], selector); w[61] = __byte_perm_S (w[28], w[29], selector); w[60] = __byte_perm_S (w[27], w[28], selector); w[59] = __byte_perm_S (w[26], w[27], selector); w[58] = __byte_perm_S (w[25], w[26], selector); w[57] = __byte_perm_S (w[24], w[25], selector); w[56] = __byte_perm_S (w[23], w[24], selector); w[55] = __byte_perm_S (w[22], w[23], selector); w[54] = __byte_perm_S (w[21], w[22], selector); w[53] = __byte_perm_S (w[20], w[21], selector); w[52] = __byte_perm_S (w[19], w[20], selector); w[51] = __byte_perm_S (w[18], w[19], selector); w[50] = __byte_perm_S 
(w[17], w[18], selector); w[49] = __byte_perm_S (w[16], w[17], selector); w[48] = __byte_perm_S (w[15], w[16], selector); w[47] = __byte_perm_S (w[14], w[15], selector); w[46] = __byte_perm_S (w[13], w[14], selector); w[45] = __byte_perm_S (w[12], w[13], selector); w[44] = __byte_perm_S (w[11], w[12], selector); w[43] = __byte_perm_S (w[10], w[11], selector); w[42] = __byte_perm_S (w[ 9], w[10], selector); w[41] = __byte_perm_S (w[ 8], w[ 9], selector); w[40] = __byte_perm_S (w[ 7], w[ 8], selector); w[39] = __byte_perm_S (w[ 6], w[ 7], selector); w[38] = __byte_perm_S (w[ 5], w[ 6], selector); w[37] = __byte_perm_S (w[ 4], w[ 5], selector); w[36] = __byte_perm_S (w[ 3], w[ 4], selector); w[35] = __byte_perm_S (w[ 2], w[ 3], selector); w[34] = __byte_perm_S (w[ 1], w[ 2], selector); w[33] = __byte_perm_S (w[ 0], w[ 1], selector); w[32] = __byte_perm_S ( 0, w[ 0], selector); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = __byte_perm_S (w[29], w[30], selector); w[62] = __byte_perm_S (w[28], w[29], selector); w[61] = __byte_perm_S (w[27], w[28], selector); w[60] = __byte_perm_S (w[26], w[27], selector); w[59] = __byte_perm_S (w[25], w[26], selector); w[58] = __byte_perm_S (w[24], w[25], selector); w[57] = __byte_perm_S (w[23], w[24], selector); w[56] = __byte_perm_S (w[22], w[23], selector); w[55] = __byte_perm_S (w[21], w[22], selector); w[54] = __byte_perm_S (w[20], w[21], selector); w[53] = __byte_perm_S (w[19], w[20], selector); w[52] = __byte_perm_S (w[18], w[19], selector); w[51] = __byte_perm_S (w[17], w[18], selector); w[50] = __byte_perm_S (w[16], w[17], selector); w[49] = __byte_perm_S (w[15], w[16], selector); w[48] = __byte_perm_S 
(w[14], w[15], selector); w[47] = __byte_perm_S (w[13], w[14], selector); w[46] = __byte_perm_S (w[12], w[13], selector); w[45] = __byte_perm_S (w[11], w[12], selector); w[44] = __byte_perm_S (w[10], w[11], selector); w[43] = __byte_perm_S (w[ 9], w[10], selector); w[42] = __byte_perm_S (w[ 8], w[ 9], selector); w[41] = __byte_perm_S (w[ 7], w[ 8], selector); w[40] = __byte_perm_S (w[ 6], w[ 7], selector); w[39] = __byte_perm_S (w[ 5], w[ 6], selector); w[38] = __byte_perm_S (w[ 4], w[ 5], selector); w[37] = __byte_perm_S (w[ 3], w[ 4], selector); w[36] = __byte_perm_S (w[ 2], w[ 3], selector); w[35] = __byte_perm_S (w[ 1], w[ 2], selector); w[34] = __byte_perm_S (w[ 0], w[ 1], selector); w[33] = __byte_perm_S ( 0, w[ 0], selector); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = __byte_perm_S (w[28], w[29], selector); w[62] = __byte_perm_S (w[27], w[28], selector); w[61] = __byte_perm_S (w[26], w[27], selector); w[60] = __byte_perm_S (w[25], w[26], selector); w[59] = __byte_perm_S (w[24], w[25], selector); w[58] = __byte_perm_S (w[23], w[24], selector); w[57] = __byte_perm_S (w[22], w[23], selector); w[56] = __byte_perm_S (w[21], w[22], selector); w[55] = __byte_perm_S (w[20], w[21], selector); w[54] = __byte_perm_S (w[19], w[20], selector); w[53] = __byte_perm_S (w[18], w[19], selector); w[52] = __byte_perm_S (w[17], w[18], selector); w[51] = __byte_perm_S (w[16], w[17], selector); w[50] = __byte_perm_S (w[15], w[16], selector); w[49] = __byte_perm_S (w[14], w[15], selector); w[48] = __byte_perm_S (w[13], w[14], selector); w[47] = __byte_perm_S (w[12], w[13], selector); w[46] = __byte_perm_S (w[11], w[12], selector); w[45] = 
__byte_perm_S (w[10], w[11], selector); w[44] = __byte_perm_S (w[ 9], w[10], selector); w[43] = __byte_perm_S (w[ 8], w[ 9], selector); w[42] = __byte_perm_S (w[ 7], w[ 8], selector); w[41] = __byte_perm_S (w[ 6], w[ 7], selector); w[40] = __byte_perm_S (w[ 5], w[ 6], selector); w[39] = __byte_perm_S (w[ 4], w[ 5], selector); w[38] = __byte_perm_S (w[ 3], w[ 4], selector); w[37] = __byte_perm_S (w[ 2], w[ 3], selector); w[36] = __byte_perm_S (w[ 1], w[ 2], selector); w[35] = __byte_perm_S (w[ 0], w[ 1], selector); w[34] = __byte_perm_S ( 0, w[ 0], selector); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = __byte_perm_S (w[27], w[28], selector); w[62] = __byte_perm_S (w[26], w[27], selector); w[61] = __byte_perm_S (w[25], w[26], selector); w[60] = __byte_perm_S (w[24], w[25], selector); w[59] = __byte_perm_S (w[23], w[24], selector); w[58] = __byte_perm_S (w[22], w[23], selector); w[57] = __byte_perm_S (w[21], w[22], selector); w[56] = __byte_perm_S (w[20], w[21], selector); w[55] = __byte_perm_S (w[19], w[20], selector); w[54] = __byte_perm_S (w[18], w[19], selector); w[53] = __byte_perm_S (w[17], w[18], selector); w[52] = __byte_perm_S (w[16], w[17], selector); w[51] = __byte_perm_S (w[15], w[16], selector); w[50] = __byte_perm_S (w[14], w[15], selector); w[49] = __byte_perm_S (w[13], w[14], selector); w[48] = __byte_perm_S (w[12], w[13], selector); w[47] = __byte_perm_S (w[11], w[12], selector); w[46] = __byte_perm_S (w[10], w[11], selector); w[45] = __byte_perm_S (w[ 9], w[10], selector); w[44] = __byte_perm_S (w[ 8], w[ 9], selector); w[43] = __byte_perm_S (w[ 7], w[ 8], selector); w[42] = __byte_perm_S (w[ 6], w[ 7], 
selector); w[41] = __byte_perm_S (w[ 5], w[ 6], selector); w[40] = __byte_perm_S (w[ 4], w[ 5], selector); w[39] = __byte_perm_S (w[ 3], w[ 4], selector); w[38] = __byte_perm_S (w[ 2], w[ 3], selector); w[37] = __byte_perm_S (w[ 1], w[ 2], selector); w[36] = __byte_perm_S (w[ 0], w[ 1], selector); w[35] = __byte_perm_S ( 0, w[ 0], selector); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = __byte_perm_S (w[26], w[27], selector); w[62] = __byte_perm_S (w[25], w[26], selector); w[61] = __byte_perm_S (w[24], w[25], selector); w[60] = __byte_perm_S (w[23], w[24], selector); w[59] = __byte_perm_S (w[22], w[23], selector); w[58] = __byte_perm_S (w[21], w[22], selector); w[57] = __byte_perm_S (w[20], w[21], selector); w[56] = __byte_perm_S (w[19], w[20], selector); w[55] = __byte_perm_S (w[18], w[19], selector); w[54] = __byte_perm_S (w[17], w[18], selector); w[53] = __byte_perm_S (w[16], w[17], selector); w[52] = __byte_perm_S (w[15], w[16], selector); w[51] = __byte_perm_S (w[14], w[15], selector); w[50] = __byte_perm_S (w[13], w[14], selector); w[49] = __byte_perm_S (w[12], w[13], selector); w[48] = __byte_perm_S (w[11], w[12], selector); w[47] = __byte_perm_S (w[10], w[11], selector); w[46] = __byte_perm_S (w[ 9], w[10], selector); w[45] = __byte_perm_S (w[ 8], w[ 9], selector); w[44] = __byte_perm_S (w[ 7], w[ 8], selector); w[43] = __byte_perm_S (w[ 6], w[ 7], selector); w[42] = __byte_perm_S (w[ 5], w[ 6], selector); w[41] = __byte_perm_S (w[ 4], w[ 5], selector); w[40] = __byte_perm_S (w[ 3], w[ 4], selector); w[39] = __byte_perm_S (w[ 2], w[ 3], selector); w[38] = __byte_perm_S (w[ 1], w[ 2], selector); w[37] 
= __byte_perm_S (w[ 0], w[ 1], selector); w[36] = __byte_perm_S ( 0, w[ 0], selector); w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = __byte_perm_S (w[25], w[26], selector); w[62] = __byte_perm_S (w[24], w[25], selector); w[61] = __byte_perm_S (w[23], w[24], selector); w[60] = __byte_perm_S (w[22], w[23], selector); w[59] = __byte_perm_S (w[21], w[22], selector); w[58] = __byte_perm_S (w[20], w[21], selector); w[57] = __byte_perm_S (w[19], w[20], selector); w[56] = __byte_perm_S (w[18], w[19], selector); w[55] = __byte_perm_S (w[17], w[18], selector); w[54] = __byte_perm_S (w[16], w[17], selector); w[53] = __byte_perm_S (w[15], w[16], selector); w[52] = __byte_perm_S (w[14], w[15], selector); w[51] = __byte_perm_S (w[13], w[14], selector); w[50] = __byte_perm_S (w[12], w[13], selector); w[49] = __byte_perm_S (w[11], w[12], selector); w[48] = __byte_perm_S (w[10], w[11], selector); w[47] = __byte_perm_S (w[ 9], w[10], selector); w[46] = __byte_perm_S (w[ 8], w[ 9], selector); w[45] = __byte_perm_S (w[ 7], w[ 8], selector); w[44] = __byte_perm_S (w[ 6], w[ 7], selector); w[43] = __byte_perm_S (w[ 5], w[ 6], selector); w[42] = __byte_perm_S (w[ 4], w[ 5], selector); w[41] = __byte_perm_S (w[ 3], w[ 4], selector); w[40] = __byte_perm_S (w[ 2], w[ 3], selector); w[39] = __byte_perm_S (w[ 1], w[ 2], selector); w[38] = __byte_perm_S (w[ 0], w[ 1], selector); w[37] = __byte_perm_S ( 0, w[ 0], selector); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 
0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = __byte_perm_S (w[24], w[25], selector); w[62] = __byte_perm_S (w[23], w[24], selector); w[61] = __byte_perm_S (w[22], w[23], selector); w[60] = __byte_perm_S (w[21], w[22], selector); w[59] = __byte_perm_S (w[20], w[21], selector); w[58] = __byte_perm_S (w[19], w[20], selector); w[57] = __byte_perm_S (w[18], w[19], selector); w[56] = __byte_perm_S (w[17], w[18], selector); w[55] = __byte_perm_S (w[16], w[17], selector); w[54] = __byte_perm_S (w[15], w[16], selector); w[53] = __byte_perm_S (w[14], w[15], selector); w[52] = __byte_perm_S (w[13], w[14], selector); w[51] = __byte_perm_S (w[12], w[13], selector); w[50] = __byte_perm_S (w[11], w[12], selector); w[49] = __byte_perm_S (w[10], w[11], selector); w[48] = __byte_perm_S (w[ 9], w[10], selector); w[47] = __byte_perm_S (w[ 8], w[ 9], selector); w[46] = __byte_perm_S (w[ 7], w[ 8], selector); w[45] = __byte_perm_S (w[ 6], w[ 7], selector); w[44] = __byte_perm_S (w[ 5], w[ 6], selector); w[43] = __byte_perm_S (w[ 4], w[ 5], selector); w[42] = __byte_perm_S (w[ 3], w[ 4], selector); w[41] = __byte_perm_S (w[ 2], w[ 3], selector); w[40] = __byte_perm_S (w[ 1], w[ 2], selector); w[39] = __byte_perm_S (w[ 0], w[ 1], selector); w[38] = __byte_perm_S ( 0, w[ 0], selector); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = __byte_perm_S (w[23], w[24], selector); w[62] = __byte_perm_S (w[22], w[23], selector); w[61] = 
__byte_perm_S (w[21], w[22], selector); w[60] = __byte_perm_S (w[20], w[21], selector); w[59] = __byte_perm_S (w[19], w[20], selector); w[58] = __byte_perm_S (w[18], w[19], selector); w[57] = __byte_perm_S (w[17], w[18], selector); w[56] = __byte_perm_S (w[16], w[17], selector); w[55] = __byte_perm_S (w[15], w[16], selector); w[54] = __byte_perm_S (w[14], w[15], selector); w[53] = __byte_perm_S (w[13], w[14], selector); w[52] = __byte_perm_S (w[12], w[13], selector); w[51] = __byte_perm_S (w[11], w[12], selector); w[50] = __byte_perm_S (w[10], w[11], selector); w[49] = __byte_perm_S (w[ 9], w[10], selector); w[48] = __byte_perm_S (w[ 8], w[ 9], selector); w[47] = __byte_perm_S (w[ 7], w[ 8], selector); w[46] = __byte_perm_S (w[ 6], w[ 7], selector); w[45] = __byte_perm_S (w[ 5], w[ 6], selector); w[44] = __byte_perm_S (w[ 4], w[ 5], selector); w[43] = __byte_perm_S (w[ 3], w[ 4], selector); w[42] = __byte_perm_S (w[ 2], w[ 3], selector); w[41] = __byte_perm_S (w[ 1], w[ 2], selector); w[40] = __byte_perm_S (w[ 0], w[ 1], selector); w[39] = __byte_perm_S ( 0, w[ 0], selector); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = __byte_perm_S (w[22], w[23], selector); w[62] = __byte_perm_S (w[21], w[22], selector); w[61] = __byte_perm_S (w[20], w[21], selector); w[60] = __byte_perm_S (w[19], w[20], selector); w[59] = __byte_perm_S (w[18], w[19], selector); w[58] = __byte_perm_S (w[17], w[18], selector); w[57] = __byte_perm_S (w[16], w[17], selector); w[56] = __byte_perm_S (w[15], w[16], selector); w[55] = __byte_perm_S (w[14], w[15], selector); w[54] = __byte_perm_S (w[13], 
w[14], selector); w[53] = __byte_perm_S (w[12], w[13], selector); w[52] = __byte_perm_S (w[11], w[12], selector); w[51] = __byte_perm_S (w[10], w[11], selector); w[50] = __byte_perm_S (w[ 9], w[10], selector); w[49] = __byte_perm_S (w[ 8], w[ 9], selector); w[48] = __byte_perm_S (w[ 7], w[ 8], selector); w[47] = __byte_perm_S (w[ 6], w[ 7], selector); w[46] = __byte_perm_S (w[ 5], w[ 6], selector); w[45] = __byte_perm_S (w[ 4], w[ 5], selector); w[44] = __byte_perm_S (w[ 3], w[ 4], selector); w[43] = __byte_perm_S (w[ 2], w[ 3], selector); w[42] = __byte_perm_S (w[ 1], w[ 2], selector); w[41] = __byte_perm_S (w[ 0], w[ 1], selector); w[40] = __byte_perm_S ( 0, w[ 0], selector); w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = __byte_perm_S (w[21], w[22], selector); w[62] = __byte_perm_S (w[20], w[21], selector); w[61] = __byte_perm_S (w[19], w[20], selector); w[60] = __byte_perm_S (w[18], w[19], selector); w[59] = __byte_perm_S (w[17], w[18], selector); w[58] = __byte_perm_S (w[16], w[17], selector); w[57] = __byte_perm_S (w[15], w[16], selector); w[56] = __byte_perm_S (w[14], w[15], selector); w[55] = __byte_perm_S (w[13], w[14], selector); w[54] = __byte_perm_S (w[12], w[13], selector); w[53] = __byte_perm_S (w[11], w[12], selector); w[52] = __byte_perm_S (w[10], w[11], selector); w[51] = __byte_perm_S (w[ 9], w[10], selector); w[50] = __byte_perm_S (w[ 8], w[ 9], selector); w[49] = __byte_perm_S (w[ 7], w[ 8], selector); w[48] = __byte_perm_S (w[ 6], w[ 7], selector); w[47] = __byte_perm_S (w[ 5], w[ 6], selector); w[46] = __byte_perm_S (w[ 4], w[ 5], 
selector); w[45] = __byte_perm_S (w[ 3], w[ 4], selector); w[44] = __byte_perm_S (w[ 2], w[ 3], selector); w[43] = __byte_perm_S (w[ 1], w[ 2], selector); w[42] = __byte_perm_S (w[ 0], w[ 1], selector); w[41] = __byte_perm_S ( 0, w[ 0], selector); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 42: w[63] = __byte_perm_S (w[20], w[21], selector); w[62] = __byte_perm_S (w[19], w[20], selector); w[61] = __byte_perm_S (w[18], w[19], selector); w[60] = __byte_perm_S (w[17], w[18], selector); w[59] = __byte_perm_S (w[16], w[17], selector); w[58] = __byte_perm_S (w[15], w[16], selector); w[57] = __byte_perm_S (w[14], w[15], selector); w[56] = __byte_perm_S (w[13], w[14], selector); w[55] = __byte_perm_S (w[12], w[13], selector); w[54] = __byte_perm_S (w[11], w[12], selector); w[53] = __byte_perm_S (w[10], w[11], selector); w[52] = __byte_perm_S (w[ 9], w[10], selector); w[51] = __byte_perm_S (w[ 8], w[ 9], selector); w[50] = __byte_perm_S (w[ 7], w[ 8], selector); w[49] = __byte_perm_S (w[ 6], w[ 7], selector); w[48] = __byte_perm_S (w[ 5], w[ 6], selector); w[47] = __byte_perm_S (w[ 4], w[ 5], selector); w[46] = __byte_perm_S (w[ 3], w[ 4], selector); w[45] = __byte_perm_S (w[ 2], w[ 3], selector); w[44] = __byte_perm_S (w[ 1], w[ 2], selector); w[43] = __byte_perm_S (w[ 0], w[ 1], selector); w[42] = __byte_perm_S ( 0, w[ 0], selector); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; 
w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = __byte_perm_S (w[19], w[20], selector); w[62] = __byte_perm_S (w[18], w[19], selector); w[61] = __byte_perm_S (w[17], w[18], selector); w[60] = __byte_perm_S (w[16], w[17], selector); w[59] = __byte_perm_S (w[15], w[16], selector); w[58] = __byte_perm_S (w[14], w[15], selector); w[57] = __byte_perm_S (w[13], w[14], selector); w[56] = __byte_perm_S (w[12], w[13], selector); w[55] = __byte_perm_S (w[11], w[12], selector); w[54] = __byte_perm_S (w[10], w[11], selector); w[53] = __byte_perm_S (w[ 9], w[10], selector); w[52] = __byte_perm_S (w[ 8], w[ 9], selector); w[51] = __byte_perm_S (w[ 7], w[ 8], selector); w[50] = __byte_perm_S (w[ 6], w[ 7], selector); w[49] = __byte_perm_S (w[ 5], w[ 6], selector); w[48] = __byte_perm_S (w[ 4], w[ 5], selector); w[47] = __byte_perm_S (w[ 3], w[ 4], selector); w[46] = __byte_perm_S (w[ 2], w[ 3], selector); w[45] = __byte_perm_S (w[ 1], w[ 2], selector); w[44] = __byte_perm_S (w[ 0], w[ 1], selector); w[43] = __byte_perm_S ( 0, w[ 0], selector); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = __byte_perm_S (w[18], w[19], selector); w[62] = __byte_perm_S (w[17], w[18], selector); w[61] = __byte_perm_S (w[16], w[17], selector); w[60] = __byte_perm_S (w[15], w[16], selector); w[59] = __byte_perm_S (w[14], w[15], selector); w[58] = 
__byte_perm_S (w[13], w[14], selector); w[57] = __byte_perm_S (w[12], w[13], selector); w[56] = __byte_perm_S (w[11], w[12], selector); w[55] = __byte_perm_S (w[10], w[11], selector); w[54] = __byte_perm_S (w[ 9], w[10], selector); w[53] = __byte_perm_S (w[ 8], w[ 9], selector); w[52] = __byte_perm_S (w[ 7], w[ 8], selector); w[51] = __byte_perm_S (w[ 6], w[ 7], selector); w[50] = __byte_perm_S (w[ 5], w[ 6], selector); w[49] = __byte_perm_S (w[ 4], w[ 5], selector); w[48] = __byte_perm_S (w[ 3], w[ 4], selector); w[47] = __byte_perm_S (w[ 2], w[ 3], selector); w[46] = __byte_perm_S (w[ 1], w[ 2], selector); w[45] = __byte_perm_S (w[ 0], w[ 1], selector); w[44] = __byte_perm_S ( 0, w[ 0], selector); w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = __byte_perm_S (w[17], w[18], selector); w[62] = __byte_perm_S (w[16], w[17], selector); w[61] = __byte_perm_S (w[15], w[16], selector); w[60] = __byte_perm_S (w[14], w[15], selector); w[59] = __byte_perm_S (w[13], w[14], selector); w[58] = __byte_perm_S (w[12], w[13], selector); w[57] = __byte_perm_S (w[11], w[12], selector); w[56] = __byte_perm_S (w[10], w[11], selector); w[55] = __byte_perm_S (w[ 9], w[10], selector); w[54] = __byte_perm_S (w[ 8], w[ 9], selector); w[53] = __byte_perm_S (w[ 7], w[ 8], selector); w[52] = __byte_perm_S (w[ 6], w[ 7], selector); w[51] = __byte_perm_S (w[ 5], w[ 6], selector); w[50] = __byte_perm_S (w[ 4], w[ 5], selector); w[49] = __byte_perm_S (w[ 3], w[ 4], selector); w[48] = __byte_perm_S (w[ 2], w[ 3], selector); w[47] = __byte_perm_S 
(w[ 1], w[ 2], selector); w[46] = __byte_perm_S (w[ 0], w[ 1], selector); w[45] = __byte_perm_S ( 0, w[ 0], selector); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = __byte_perm_S (w[16], w[17], selector); w[62] = __byte_perm_S (w[15], w[16], selector); w[61] = __byte_perm_S (w[14], w[15], selector); w[60] = __byte_perm_S (w[13], w[14], selector); w[59] = __byte_perm_S (w[12], w[13], selector); w[58] = __byte_perm_S (w[11], w[12], selector); w[57] = __byte_perm_S (w[10], w[11], selector); w[56] = __byte_perm_S (w[ 9], w[10], selector); w[55] = __byte_perm_S (w[ 8], w[ 9], selector); w[54] = __byte_perm_S (w[ 7], w[ 8], selector); w[53] = __byte_perm_S (w[ 6], w[ 7], selector); w[52] = __byte_perm_S (w[ 5], w[ 6], selector); w[51] = __byte_perm_S (w[ 4], w[ 5], selector); w[50] = __byte_perm_S (w[ 3], w[ 4], selector); w[49] = __byte_perm_S (w[ 2], w[ 3], selector); w[48] = __byte_perm_S (w[ 1], w[ 2], selector); w[47] = __byte_perm_S (w[ 0], w[ 1], selector); w[46] = __byte_perm_S ( 0, w[ 0], selector); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; 
break; case 47: w[63] = __byte_perm_S (w[15], w[16], selector); w[62] = __byte_perm_S (w[14], w[15], selector); w[61] = __byte_perm_S (w[13], w[14], selector); w[60] = __byte_perm_S (w[12], w[13], selector); w[59] = __byte_perm_S (w[11], w[12], selector); w[58] = __byte_perm_S (w[10], w[11], selector); w[57] = __byte_perm_S (w[ 9], w[10], selector); w[56] = __byte_perm_S (w[ 8], w[ 9], selector); w[55] = __byte_perm_S (w[ 7], w[ 8], selector); w[54] = __byte_perm_S (w[ 6], w[ 7], selector); w[53] = __byte_perm_S (w[ 5], w[ 6], selector); w[52] = __byte_perm_S (w[ 4], w[ 5], selector); w[51] = __byte_perm_S (w[ 3], w[ 4], selector); w[50] = __byte_perm_S (w[ 2], w[ 3], selector); w[49] = __byte_perm_S (w[ 1], w[ 2], selector); w[48] = __byte_perm_S (w[ 0], w[ 1], selector); w[47] = __byte_perm_S ( 0, w[ 0], selector); w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = __byte_perm_S (w[14], w[15], selector); w[62] = __byte_perm_S (w[13], w[14], selector); w[61] = __byte_perm_S (w[12], w[13], selector); w[60] = __byte_perm_S (w[11], w[12], selector); w[59] = __byte_perm_S (w[10], w[11], selector); w[58] = __byte_perm_S (w[ 9], w[10], selector); w[57] = __byte_perm_S (w[ 8], w[ 9], selector); w[56] = __byte_perm_S (w[ 7], w[ 8], selector); w[55] = __byte_perm_S (w[ 6], w[ 7], selector); w[54] = __byte_perm_S (w[ 5], w[ 6], selector); w[53] = __byte_perm_S (w[ 4], w[ 5], selector); w[52] = __byte_perm_S (w[ 3], w[ 4], selector); w[51] = __byte_perm_S (w[ 2], w[ 3], selector); w[50] = 
__byte_perm_S (w[ 1], w[ 2], selector); w[49] = __byte_perm_S (w[ 0], w[ 1], selector); w[48] = __byte_perm_S ( 0, w[ 0], selector); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = __byte_perm_S (w[13], w[14], selector); w[62] = __byte_perm_S (w[12], w[13], selector); w[61] = __byte_perm_S (w[11], w[12], selector); w[60] = __byte_perm_S (w[10], w[11], selector); w[59] = __byte_perm_S (w[ 9], w[10], selector); w[58] = __byte_perm_S (w[ 8], w[ 9], selector); w[57] = __byte_perm_S (w[ 7], w[ 8], selector); w[56] = __byte_perm_S (w[ 6], w[ 7], selector); w[55] = __byte_perm_S (w[ 5], w[ 6], selector); w[54] = __byte_perm_S (w[ 4], w[ 5], selector); w[53] = __byte_perm_S (w[ 3], w[ 4], selector); w[52] = __byte_perm_S (w[ 2], w[ 3], selector); w[51] = __byte_perm_S (w[ 1], w[ 2], selector); w[50] = __byte_perm_S (w[ 0], w[ 1], selector); w[49] = __byte_perm_S ( 0, w[ 0], selector); w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = __byte_perm_S (w[12], w[13], selector); 
w[62] = __byte_perm_S (w[11], w[12], selector); w[61] = __byte_perm_S (w[10], w[11], selector); w[60] = __byte_perm_S (w[ 9], w[10], selector); w[59] = __byte_perm_S (w[ 8], w[ 9], selector); w[58] = __byte_perm_S (w[ 7], w[ 8], selector); w[57] = __byte_perm_S (w[ 6], w[ 7], selector); w[56] = __byte_perm_S (w[ 5], w[ 6], selector); w[55] = __byte_perm_S (w[ 4], w[ 5], selector); w[54] = __byte_perm_S (w[ 3], w[ 4], selector); w[53] = __byte_perm_S (w[ 2], w[ 3], selector); w[52] = __byte_perm_S (w[ 1], w[ 2], selector); w[51] = __byte_perm_S (w[ 0], w[ 1], selector); w[50] = __byte_perm_S ( 0, w[ 0], selector); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = __byte_perm_S (w[11], w[12], selector); w[62] = __byte_perm_S (w[10], w[11], selector); w[61] = __byte_perm_S (w[ 9], w[10], selector); w[60] = __byte_perm_S (w[ 8], w[ 9], selector); w[59] = __byte_perm_S (w[ 7], w[ 8], selector); w[58] = __byte_perm_S (w[ 6], w[ 7], selector); w[57] = __byte_perm_S (w[ 5], w[ 6], selector); w[56] = __byte_perm_S (w[ 4], w[ 5], selector); w[55] = __byte_perm_S (w[ 3], w[ 4], selector); w[54] = __byte_perm_S (w[ 2], w[ 3], selector); w[53] = __byte_perm_S (w[ 1], w[ 2], selector); w[52] = __byte_perm_S (w[ 0], w[ 1], selector); w[51] = __byte_perm_S ( 0, w[ 0], selector); w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; 
w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = __byte_perm_S (w[10], w[11], selector); w[62] = __byte_perm_S (w[ 9], w[10], selector); w[61] = __byte_perm_S (w[ 8], w[ 9], selector); w[60] = __byte_perm_S (w[ 7], w[ 8], selector); w[59] = __byte_perm_S (w[ 6], w[ 7], selector); w[58] = __byte_perm_S (w[ 5], w[ 6], selector); w[57] = __byte_perm_S (w[ 4], w[ 5], selector); w[56] = __byte_perm_S (w[ 3], w[ 4], selector); w[55] = __byte_perm_S (w[ 2], w[ 3], selector); w[54] = __byte_perm_S (w[ 1], w[ 2], selector); w[53] = __byte_perm_S (w[ 0], w[ 1], selector); w[52] = __byte_perm_S ( 0, w[ 0], selector); w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = __byte_perm_S (w[ 9], w[10], selector); w[62] = __byte_perm_S (w[ 8], w[ 9], selector); w[61] = __byte_perm_S (w[ 7], w[ 8], selector); w[60] = __byte_perm_S (w[ 6], w[ 7], selector); w[59] = __byte_perm_S (w[ 5], w[ 6], selector); w[58] = __byte_perm_S (w[ 4], w[ 5], selector); w[57] = __byte_perm_S (w[ 3], w[ 4], selector); w[56] = __byte_perm_S (w[ 2], w[ 3], selector); w[55] = __byte_perm_S (w[ 1], w[ 2], selector); w[54] = 
__byte_perm_S (w[ 0], w[ 1], selector); w[53] = __byte_perm_S ( 0, w[ 0], selector); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = __byte_perm_S (w[ 8], w[ 9], selector); w[62] = __byte_perm_S (w[ 7], w[ 8], selector); w[61] = __byte_perm_S (w[ 6], w[ 7], selector); w[60] = __byte_perm_S (w[ 5], w[ 6], selector); w[59] = __byte_perm_S (w[ 4], w[ 5], selector); w[58] = __byte_perm_S (w[ 3], w[ 4], selector); w[57] = __byte_perm_S (w[ 2], w[ 3], selector); w[56] = __byte_perm_S (w[ 1], w[ 2], selector); w[55] = __byte_perm_S (w[ 0], w[ 1], selector); w[54] = __byte_perm_S ( 0, w[ 0], selector); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = __byte_perm_S (w[ 7], w[ 8], selector); w[62] = __byte_perm_S (w[ 6], w[ 7], selector); w[61] = __byte_perm_S (w[ 5], w[ 6], selector); w[60] = __byte_perm_S (w[ 4], w[ 5], selector); w[59] = __byte_perm_S (w[ 3], w[ 4], 
selector); w[58] = __byte_perm_S (w[ 2], w[ 3], selector); w[57] = __byte_perm_S (w[ 1], w[ 2], selector); w[56] = __byte_perm_S (w[ 0], w[ 1], selector); w[55] = __byte_perm_S ( 0, w[ 0], selector); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 56: w[63] = __byte_perm_S (w[ 6], w[ 7], selector); w[62] = __byte_perm_S (w[ 5], w[ 6], selector); w[61] = __byte_perm_S (w[ 4], w[ 5], selector); w[60] = __byte_perm_S (w[ 3], w[ 4], selector); w[59] = __byte_perm_S (w[ 2], w[ 3], selector); w[58] = __byte_perm_S (w[ 1], w[ 2], selector); w[57] = __byte_perm_S (w[ 0], w[ 1], selector); w[56] = __byte_perm_S ( 0, w[ 0], selector); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = __byte_perm_S (w[ 5], w[ 6], selector); w[62] = __byte_perm_S (w[ 4], w[ 5], selector); w[61] = __byte_perm_S (w[ 3], w[ 4], selector); w[60] = __byte_perm_S 
(w[ 2], w[ 3], selector); w[59] = __byte_perm_S (w[ 1], w[ 2], selector); w[58] = __byte_perm_S (w[ 0], w[ 1], selector); w[57] = __byte_perm_S ( 0, w[ 0], selector); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = __byte_perm_S (w[ 4], w[ 5], selector); w[62] = __byte_perm_S (w[ 3], w[ 4], selector); w[61] = __byte_perm_S (w[ 2], w[ 3], selector); w[60] = __byte_perm_S (w[ 1], w[ 2], selector); w[59] = __byte_perm_S (w[ 0], w[ 1], selector); w[58] = __byte_perm_S ( 0, w[ 0], selector); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = __byte_perm_S (w[ 3], w[ 4], selector); w[62] = __byte_perm_S (w[ 2], w[ 3], selector); w[61] = __byte_perm_S (w[ 1], w[ 2], selector); w[60] = __byte_perm_S (w[ 0], w[ 1], selector); w[59] = __byte_perm_S ( 0, w[ 0], selector); w[58] = 0; 
w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = __byte_perm_S (w[ 2], w[ 3], selector); w[62] = __byte_perm_S (w[ 1], w[ 2], selector); w[61] = __byte_perm_S (w[ 0], w[ 1], selector); w[60] = __byte_perm_S ( 0, w[ 0], selector); w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = __byte_perm_S (w[ 1], w[ 2], selector); w[62] = __byte_perm_S (w[ 0], w[ 1], selector); w[61] = __byte_perm_S ( 0, w[ 0], selector); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 
0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = __byte_perm_S (w[ 0], w[ 1], selector); w[62] = __byte_perm_S ( 0, w[ 0], selector); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 63: w[63] = __byte_perm_S ( 0, w[ 0], selector); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #endif } DECLSPEC void switch_buffer_by_offset_1x64_be_S (u32 w[64], const u32 offset) { const int offset_switch 
= offset / 4; #if (defined IS_AMD && AMD_GCN < 3) || defined IS_GENERIC switch (offset_switch) { case 0: w[63] = amd_bytealign_S (w[62], w[63], offset); w[62] = amd_bytealign_S (w[61], w[62], offset); w[61] = amd_bytealign_S (w[60], w[61], offset); w[60] = amd_bytealign_S (w[59], w[60], offset); w[59] = amd_bytealign_S (w[58], w[59], offset); w[58] = amd_bytealign_S (w[57], w[58], offset); w[57] = amd_bytealign_S (w[56], w[57], offset); w[56] = amd_bytealign_S (w[55], w[56], offset); w[55] = amd_bytealign_S (w[54], w[55], offset); w[54] = amd_bytealign_S (w[53], w[54], offset); w[53] = amd_bytealign_S (w[52], w[53], offset); w[52] = amd_bytealign_S (w[51], w[52], offset); w[51] = amd_bytealign_S (w[50], w[51], offset); w[50] = amd_bytealign_S (w[49], w[50], offset); w[49] = amd_bytealign_S (w[48], w[49], offset); w[48] = amd_bytealign_S (w[47], w[48], offset); w[47] = amd_bytealign_S (w[46], w[47], offset); w[46] = amd_bytealign_S (w[45], w[46], offset); w[45] = amd_bytealign_S (w[44], w[45], offset); w[44] = amd_bytealign_S (w[43], w[44], offset); w[43] = amd_bytealign_S (w[42], w[43], offset); w[42] = amd_bytealign_S (w[41], w[42], offset); w[41] = amd_bytealign_S (w[40], w[41], offset); w[40] = amd_bytealign_S (w[39], w[40], offset); w[39] = amd_bytealign_S (w[38], w[39], offset); w[38] = amd_bytealign_S (w[37], w[38], offset); w[37] = amd_bytealign_S (w[36], w[37], offset); w[36] = amd_bytealign_S (w[35], w[36], offset); w[35] = amd_bytealign_S (w[34], w[35], offset); w[34] = amd_bytealign_S (w[33], w[34], offset); w[33] = amd_bytealign_S (w[32], w[33], offset); w[32] = amd_bytealign_S (w[31], w[32], offset); w[31] = amd_bytealign_S (w[30], w[31], offset); w[30] = amd_bytealign_S (w[29], w[30], offset); w[29] = amd_bytealign_S (w[28], w[29], offset); w[28] = amd_bytealign_S (w[27], w[28], offset); w[27] = amd_bytealign_S (w[26], w[27], offset); w[26] = amd_bytealign_S (w[25], w[26], offset); w[25] = amd_bytealign_S (w[24], w[25], offset); w[24] = 
amd_bytealign_S (w[23], w[24], offset); w[23] = amd_bytealign_S (w[22], w[23], offset); w[22] = amd_bytealign_S (w[21], w[22], offset); w[21] = amd_bytealign_S (w[20], w[21], offset); w[20] = amd_bytealign_S (w[19], w[20], offset); w[19] = amd_bytealign_S (w[18], w[19], offset); w[18] = amd_bytealign_S (w[17], w[18], offset); w[17] = amd_bytealign_S (w[16], w[17], offset); w[16] = amd_bytealign_S (w[15], w[16], offset); w[15] = amd_bytealign_S (w[14], w[15], offset); w[14] = amd_bytealign_S (w[13], w[14], offset); w[13] = amd_bytealign_S (w[12], w[13], offset); w[12] = amd_bytealign_S (w[11], w[12], offset); w[11] = amd_bytealign_S (w[10], w[11], offset); w[10] = amd_bytealign_S (w[ 9], w[10], offset); w[ 9] = amd_bytealign_S (w[ 8], w[ 9], offset); w[ 8] = amd_bytealign_S (w[ 7], w[ 8], offset); w[ 7] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 6] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 5] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 4] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 3] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 2] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 1] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 0] = amd_bytealign_S ( 0, w[ 0], offset); break; case 1: w[63] = amd_bytealign_S (w[61], w[62], offset); w[62] = amd_bytealign_S (w[60], w[61], offset); w[61] = amd_bytealign_S (w[59], w[60], offset); w[60] = amd_bytealign_S (w[58], w[59], offset); w[59] = amd_bytealign_S (w[57], w[58], offset); w[58] = amd_bytealign_S (w[56], w[57], offset); w[57] = amd_bytealign_S (w[55], w[56], offset); w[56] = amd_bytealign_S (w[54], w[55], offset); w[55] = amd_bytealign_S (w[53], w[54], offset); w[54] = amd_bytealign_S (w[52], w[53], offset); w[53] = amd_bytealign_S (w[51], w[52], offset); w[52] = amd_bytealign_S (w[50], w[51], offset); w[51] = amd_bytealign_S (w[49], w[50], offset); w[50] = amd_bytealign_S (w[48], w[49], offset); w[49] = amd_bytealign_S (w[47], w[48], offset); w[48] = amd_bytealign_S (w[46], w[47], offset); w[47] = amd_bytealign_S 
(w[45], w[46], offset); w[46] = amd_bytealign_S (w[44], w[45], offset); w[45] = amd_bytealign_S (w[43], w[44], offset); w[44] = amd_bytealign_S (w[42], w[43], offset); w[43] = amd_bytealign_S (w[41], w[42], offset); w[42] = amd_bytealign_S (w[40], w[41], offset); w[41] = amd_bytealign_S (w[39], w[40], offset); w[40] = amd_bytealign_S (w[38], w[39], offset); w[39] = amd_bytealign_S (w[37], w[38], offset); w[38] = amd_bytealign_S (w[36], w[37], offset); w[37] = amd_bytealign_S (w[35], w[36], offset); w[36] = amd_bytealign_S (w[34], w[35], offset); w[35] = amd_bytealign_S (w[33], w[34], offset); w[34] = amd_bytealign_S (w[32], w[33], offset); w[33] = amd_bytealign_S (w[31], w[32], offset); w[32] = amd_bytealign_S (w[30], w[31], offset); w[31] = amd_bytealign_S (w[29], w[30], offset); w[30] = amd_bytealign_S (w[28], w[29], offset); w[29] = amd_bytealign_S (w[27], w[28], offset); w[28] = amd_bytealign_S (w[26], w[27], offset); w[27] = amd_bytealign_S (w[25], w[26], offset); w[26] = amd_bytealign_S (w[24], w[25], offset); w[25] = amd_bytealign_S (w[23], w[24], offset); w[24] = amd_bytealign_S (w[22], w[23], offset); w[23] = amd_bytealign_S (w[21], w[22], offset); w[22] = amd_bytealign_S (w[20], w[21], offset); w[21] = amd_bytealign_S (w[19], w[20], offset); w[20] = amd_bytealign_S (w[18], w[19], offset); w[19] = amd_bytealign_S (w[17], w[18], offset); w[18] = amd_bytealign_S (w[16], w[17], offset); w[17] = amd_bytealign_S (w[15], w[16], offset); w[16] = amd_bytealign_S (w[14], w[15], offset); w[15] = amd_bytealign_S (w[13], w[14], offset); w[14] = amd_bytealign_S (w[12], w[13], offset); w[13] = amd_bytealign_S (w[11], w[12], offset); w[12] = amd_bytealign_S (w[10], w[11], offset); w[11] = amd_bytealign_S (w[ 9], w[10], offset); w[10] = amd_bytealign_S (w[ 8], w[ 9], offset); w[ 9] = amd_bytealign_S (w[ 7], w[ 8], offset); w[ 8] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 7] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 6] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 5] = 
amd_bytealign_S (w[ 3], w[ 4], offset); w[ 4] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 3] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 2] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 1] = amd_bytealign_S ( 0, w[ 0], offset); w[ 0] = 0; break; case 2: w[63] = amd_bytealign_S (w[60], w[61], offset); w[62] = amd_bytealign_S (w[59], w[60], offset); w[61] = amd_bytealign_S (w[58], w[59], offset); w[60] = amd_bytealign_S (w[57], w[58], offset); w[59] = amd_bytealign_S (w[56], w[57], offset); w[58] = amd_bytealign_S (w[55], w[56], offset); w[57] = amd_bytealign_S (w[54], w[55], offset); w[56] = amd_bytealign_S (w[53], w[54], offset); w[55] = amd_bytealign_S (w[52], w[53], offset); w[54] = amd_bytealign_S (w[51], w[52], offset); w[53] = amd_bytealign_S (w[50], w[51], offset); w[52] = amd_bytealign_S (w[49], w[50], offset); w[51] = amd_bytealign_S (w[48], w[49], offset); w[50] = amd_bytealign_S (w[47], w[48], offset); w[49] = amd_bytealign_S (w[46], w[47], offset); w[48] = amd_bytealign_S (w[45], w[46], offset); w[47] = amd_bytealign_S (w[44], w[45], offset); w[46] = amd_bytealign_S (w[43], w[44], offset); w[45] = amd_bytealign_S (w[42], w[43], offset); w[44] = amd_bytealign_S (w[41], w[42], offset); w[43] = amd_bytealign_S (w[40], w[41], offset); w[42] = amd_bytealign_S (w[39], w[40], offset); w[41] = amd_bytealign_S (w[38], w[39], offset); w[40] = amd_bytealign_S (w[37], w[38], offset); w[39] = amd_bytealign_S (w[36], w[37], offset); w[38] = amd_bytealign_S (w[35], w[36], offset); w[37] = amd_bytealign_S (w[34], w[35], offset); w[36] = amd_bytealign_S (w[33], w[34], offset); w[35] = amd_bytealign_S (w[32], w[33], offset); w[34] = amd_bytealign_S (w[31], w[32], offset); w[33] = amd_bytealign_S (w[30], w[31], offset); w[32] = amd_bytealign_S (w[29], w[30], offset); w[31] = amd_bytealign_S (w[28], w[29], offset); w[30] = amd_bytealign_S (w[27], w[28], offset); w[29] = amd_bytealign_S (w[26], w[27], offset); w[28] = amd_bytealign_S (w[25], w[26], offset); w[27] = 
amd_bytealign_S (w[24], w[25], offset); w[26] = amd_bytealign_S (w[23], w[24], offset); w[25] = amd_bytealign_S (w[22], w[23], offset); w[24] = amd_bytealign_S (w[21], w[22], offset); w[23] = amd_bytealign_S (w[20], w[21], offset); w[22] = amd_bytealign_S (w[19], w[20], offset); w[21] = amd_bytealign_S (w[18], w[19], offset); w[20] = amd_bytealign_S (w[17], w[18], offset); w[19] = amd_bytealign_S (w[16], w[17], offset); w[18] = amd_bytealign_S (w[15], w[16], offset); w[17] = amd_bytealign_S (w[14], w[15], offset); w[16] = amd_bytealign_S (w[13], w[14], offset); w[15] = amd_bytealign_S (w[12], w[13], offset); w[14] = amd_bytealign_S (w[11], w[12], offset); w[13] = amd_bytealign_S (w[10], w[11], offset); w[12] = amd_bytealign_S (w[ 9], w[10], offset); w[11] = amd_bytealign_S (w[ 8], w[ 9], offset); w[10] = amd_bytealign_S (w[ 7], w[ 8], offset); w[ 9] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 8] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 7] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 6] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 5] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 4] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 3] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 2] = amd_bytealign_S ( 0, w[ 0], offset); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = amd_bytealign_S (w[59], w[60], offset); w[62] = amd_bytealign_S (w[58], w[59], offset); w[61] = amd_bytealign_S (w[57], w[58], offset); w[60] = amd_bytealign_S (w[56], w[57], offset); w[59] = amd_bytealign_S (w[55], w[56], offset); w[58] = amd_bytealign_S (w[54], w[55], offset); w[57] = amd_bytealign_S (w[53], w[54], offset); w[56] = amd_bytealign_S (w[52], w[53], offset); w[55] = amd_bytealign_S (w[51], w[52], offset); w[54] = amd_bytealign_S (w[50], w[51], offset); w[53] = amd_bytealign_S (w[49], w[50], offset); w[52] = amd_bytealign_S (w[48], w[49], offset); w[51] = amd_bytealign_S (w[47], w[48], offset); w[50] = amd_bytealign_S (w[46], w[47], offset); w[49] = amd_bytealign_S (w[45], w[46], offset); w[48] 
= amd_bytealign_S (w[44], w[45], offset); w[47] = amd_bytealign_S (w[43], w[44], offset); w[46] = amd_bytealign_S (w[42], w[43], offset); w[45] = amd_bytealign_S (w[41], w[42], offset); w[44] = amd_bytealign_S (w[40], w[41], offset); w[43] = amd_bytealign_S (w[39], w[40], offset); w[42] = amd_bytealign_S (w[38], w[39], offset); w[41] = amd_bytealign_S (w[37], w[38], offset); w[40] = amd_bytealign_S (w[36], w[37], offset); w[39] = amd_bytealign_S (w[35], w[36], offset); w[38] = amd_bytealign_S (w[34], w[35], offset); w[37] = amd_bytealign_S (w[33], w[34], offset); w[36] = amd_bytealign_S (w[32], w[33], offset); w[35] = amd_bytealign_S (w[31], w[32], offset); w[34] = amd_bytealign_S (w[30], w[31], offset); w[33] = amd_bytealign_S (w[29], w[30], offset); w[32] = amd_bytealign_S (w[28], w[29], offset); w[31] = amd_bytealign_S (w[27], w[28], offset); w[30] = amd_bytealign_S (w[26], w[27], offset); w[29] = amd_bytealign_S (w[25], w[26], offset); w[28] = amd_bytealign_S (w[24], w[25], offset); w[27] = amd_bytealign_S (w[23], w[24], offset); w[26] = amd_bytealign_S (w[22], w[23], offset); w[25] = amd_bytealign_S (w[21], w[22], offset); w[24] = amd_bytealign_S (w[20], w[21], offset); w[23] = amd_bytealign_S (w[19], w[20], offset); w[22] = amd_bytealign_S (w[18], w[19], offset); w[21] = amd_bytealign_S (w[17], w[18], offset); w[20] = amd_bytealign_S (w[16], w[17], offset); w[19] = amd_bytealign_S (w[15], w[16], offset); w[18] = amd_bytealign_S (w[14], w[15], offset); w[17] = amd_bytealign_S (w[13], w[14], offset); w[16] = amd_bytealign_S (w[12], w[13], offset); w[15] = amd_bytealign_S (w[11], w[12], offset); w[14] = amd_bytealign_S (w[10], w[11], offset); w[13] = amd_bytealign_S (w[ 9], w[10], offset); w[12] = amd_bytealign_S (w[ 8], w[ 9], offset); w[11] = amd_bytealign_S (w[ 7], w[ 8], offset); w[10] = amd_bytealign_S (w[ 6], w[ 7], offset); w[ 9] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 8] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 7] = amd_bytealign_S (w[ 3], w[ 
4], offset); w[ 6] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 5] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 4] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 3] = amd_bytealign_S ( 0, w[ 0], offset); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = amd_bytealign_S (w[58], w[59], offset); w[62] = amd_bytealign_S (w[57], w[58], offset); w[61] = amd_bytealign_S (w[56], w[57], offset); w[60] = amd_bytealign_S (w[55], w[56], offset); w[59] = amd_bytealign_S (w[54], w[55], offset); w[58] = amd_bytealign_S (w[53], w[54], offset); w[57] = amd_bytealign_S (w[52], w[53], offset); w[56] = amd_bytealign_S (w[51], w[52], offset); w[55] = amd_bytealign_S (w[50], w[51], offset); w[54] = amd_bytealign_S (w[49], w[50], offset); w[53] = amd_bytealign_S (w[48], w[49], offset); w[52] = amd_bytealign_S (w[47], w[48], offset); w[51] = amd_bytealign_S (w[46], w[47], offset); w[50] = amd_bytealign_S (w[45], w[46], offset); w[49] = amd_bytealign_S (w[44], w[45], offset); w[48] = amd_bytealign_S (w[43], w[44], offset); w[47] = amd_bytealign_S (w[42], w[43], offset); w[46] = amd_bytealign_S (w[41], w[42], offset); w[45] = amd_bytealign_S (w[40], w[41], offset); w[44] = amd_bytealign_S (w[39], w[40], offset); w[43] = amd_bytealign_S (w[38], w[39], offset); w[42] = amd_bytealign_S (w[37], w[38], offset); w[41] = amd_bytealign_S (w[36], w[37], offset); w[40] = amd_bytealign_S (w[35], w[36], offset); w[39] = amd_bytealign_S (w[34], w[35], offset); w[38] = amd_bytealign_S (w[33], w[34], offset); w[37] = amd_bytealign_S (w[32], w[33], offset); w[36] = amd_bytealign_S (w[31], w[32], offset); w[35] = amd_bytealign_S (w[30], w[31], offset); w[34] = amd_bytealign_S (w[29], w[30], offset); w[33] = amd_bytealign_S (w[28], w[29], offset); w[32] = amd_bytealign_S (w[27], w[28], offset); w[31] = amd_bytealign_S (w[26], w[27], offset); w[30] = amd_bytealign_S (w[25], w[26], offset); w[29] = amd_bytealign_S (w[24], w[25], offset); w[28] = amd_bytealign_S (w[23], w[24], offset); w[27] = 
amd_bytealign_S (w[22], w[23], offset); w[26] = amd_bytealign_S (w[21], w[22], offset); w[25] = amd_bytealign_S (w[20], w[21], offset); w[24] = amd_bytealign_S (w[19], w[20], offset); w[23] = amd_bytealign_S (w[18], w[19], offset); w[22] = amd_bytealign_S (w[17], w[18], offset); w[21] = amd_bytealign_S (w[16], w[17], offset); w[20] = amd_bytealign_S (w[15], w[16], offset); w[19] = amd_bytealign_S (w[14], w[15], offset); w[18] = amd_bytealign_S (w[13], w[14], offset); w[17] = amd_bytealign_S (w[12], w[13], offset); w[16] = amd_bytealign_S (w[11], w[12], offset); w[15] = amd_bytealign_S (w[10], w[11], offset); w[14] = amd_bytealign_S (w[ 9], w[10], offset); w[13] = amd_bytealign_S (w[ 8], w[ 9], offset); w[12] = amd_bytealign_S (w[ 7], w[ 8], offset); w[11] = amd_bytealign_S (w[ 6], w[ 7], offset); w[10] = amd_bytealign_S (w[ 5], w[ 6], offset); w[ 9] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 8] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 7] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 6] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 5] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 4] = amd_bytealign_S ( 0, w[ 0], offset); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = amd_bytealign_S (w[57], w[58], offset); w[62] = amd_bytealign_S (w[56], w[57], offset); w[61] = amd_bytealign_S (w[55], w[56], offset); w[60] = amd_bytealign_S (w[54], w[55], offset); w[59] = amd_bytealign_S (w[53], w[54], offset); w[58] = amd_bytealign_S (w[52], w[53], offset); w[57] = amd_bytealign_S (w[51], w[52], offset); w[56] = amd_bytealign_S (w[50], w[51], offset); w[55] = amd_bytealign_S (w[49], w[50], offset); w[54] = amd_bytealign_S (w[48], w[49], offset); w[53] = amd_bytealign_S (w[47], w[48], offset); w[52] = amd_bytealign_S (w[46], w[47], offset); w[51] = amd_bytealign_S (w[45], w[46], offset); w[50] = amd_bytealign_S (w[44], w[45], offset); w[49] = amd_bytealign_S (w[43], w[44], offset); w[48] = amd_bytealign_S (w[42], w[43], offset); w[47] = amd_bytealign_S (w[41], 
w[42], offset); w[46] = amd_bytealign_S (w[40], w[41], offset); w[45] = amd_bytealign_S (w[39], w[40], offset); w[44] = amd_bytealign_S (w[38], w[39], offset); w[43] = amd_bytealign_S (w[37], w[38], offset); w[42] = amd_bytealign_S (w[36], w[37], offset); w[41] = amd_bytealign_S (w[35], w[36], offset); w[40] = amd_bytealign_S (w[34], w[35], offset); w[39] = amd_bytealign_S (w[33], w[34], offset); w[38] = amd_bytealign_S (w[32], w[33], offset); w[37] = amd_bytealign_S (w[31], w[32], offset); w[36] = amd_bytealign_S (w[30], w[31], offset); w[35] = amd_bytealign_S (w[29], w[30], offset); w[34] = amd_bytealign_S (w[28], w[29], offset); w[33] = amd_bytealign_S (w[27], w[28], offset); w[32] = amd_bytealign_S (w[26], w[27], offset); w[31] = amd_bytealign_S (w[25], w[26], offset); w[30] = amd_bytealign_S (w[24], w[25], offset); w[29] = amd_bytealign_S (w[23], w[24], offset); w[28] = amd_bytealign_S (w[22], w[23], offset); w[27] = amd_bytealign_S (w[21], w[22], offset); w[26] = amd_bytealign_S (w[20], w[21], offset); w[25] = amd_bytealign_S (w[19], w[20], offset); w[24] = amd_bytealign_S (w[18], w[19], offset); w[23] = amd_bytealign_S (w[17], w[18], offset); w[22] = amd_bytealign_S (w[16], w[17], offset); w[21] = amd_bytealign_S (w[15], w[16], offset); w[20] = amd_bytealign_S (w[14], w[15], offset); w[19] = amd_bytealign_S (w[13], w[14], offset); w[18] = amd_bytealign_S (w[12], w[13], offset); w[17] = amd_bytealign_S (w[11], w[12], offset); w[16] = amd_bytealign_S (w[10], w[11], offset); w[15] = amd_bytealign_S (w[ 9], w[10], offset); w[14] = amd_bytealign_S (w[ 8], w[ 9], offset); w[13] = amd_bytealign_S (w[ 7], w[ 8], offset); w[12] = amd_bytealign_S (w[ 6], w[ 7], offset); w[11] = amd_bytealign_S (w[ 5], w[ 6], offset); w[10] = amd_bytealign_S (w[ 4], w[ 5], offset); w[ 9] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 8] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 7] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 6] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 5] = 
amd_bytealign_S ( 0, w[ 0], offset); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = amd_bytealign_S (w[56], w[57], offset); w[62] = amd_bytealign_S (w[55], w[56], offset); w[61] = amd_bytealign_S (w[54], w[55], offset); w[60] = amd_bytealign_S (w[53], w[54], offset); w[59] = amd_bytealign_S (w[52], w[53], offset); w[58] = amd_bytealign_S (w[51], w[52], offset); w[57] = amd_bytealign_S (w[50], w[51], offset); w[56] = amd_bytealign_S (w[49], w[50], offset); w[55] = amd_bytealign_S (w[48], w[49], offset); w[54] = amd_bytealign_S (w[47], w[48], offset); w[53] = amd_bytealign_S (w[46], w[47], offset); w[52] = amd_bytealign_S (w[45], w[46], offset); w[51] = amd_bytealign_S (w[44], w[45], offset); w[50] = amd_bytealign_S (w[43], w[44], offset); w[49] = amd_bytealign_S (w[42], w[43], offset); w[48] = amd_bytealign_S (w[41], w[42], offset); w[47] = amd_bytealign_S (w[40], w[41], offset); w[46] = amd_bytealign_S (w[39], w[40], offset); w[45] = amd_bytealign_S (w[38], w[39], offset); w[44] = amd_bytealign_S (w[37], w[38], offset); w[43] = amd_bytealign_S (w[36], w[37], offset); w[42] = amd_bytealign_S (w[35], w[36], offset); w[41] = amd_bytealign_S (w[34], w[35], offset); w[40] = amd_bytealign_S (w[33], w[34], offset); w[39] = amd_bytealign_S (w[32], w[33], offset); w[38] = amd_bytealign_S (w[31], w[32], offset); w[37] = amd_bytealign_S (w[30], w[31], offset); w[36] = amd_bytealign_S (w[29], w[30], offset); w[35] = amd_bytealign_S (w[28], w[29], offset); w[34] = amd_bytealign_S (w[27], w[28], offset); w[33] = amd_bytealign_S (w[26], w[27], offset); w[32] = amd_bytealign_S (w[25], w[26], offset); w[31] = amd_bytealign_S (w[24], w[25], offset); w[30] = amd_bytealign_S (w[23], w[24], offset); w[29] = amd_bytealign_S (w[22], w[23], offset); w[28] = amd_bytealign_S (w[21], w[22], offset); w[27] = amd_bytealign_S (w[20], w[21], offset); w[26] = amd_bytealign_S (w[19], w[20], offset); w[25] = amd_bytealign_S (w[18], w[19], offset); w[24] = 
amd_bytealign_S (w[17], w[18], offset); w[23] = amd_bytealign_S (w[16], w[17], offset); w[22] = amd_bytealign_S (w[15], w[16], offset); w[21] = amd_bytealign_S (w[14], w[15], offset); w[20] = amd_bytealign_S (w[13], w[14], offset); w[19] = amd_bytealign_S (w[12], w[13], offset); w[18] = amd_bytealign_S (w[11], w[12], offset); w[17] = amd_bytealign_S (w[10], w[11], offset); w[16] = amd_bytealign_S (w[ 9], w[10], offset); w[15] = amd_bytealign_S (w[ 8], w[ 9], offset); w[14] = amd_bytealign_S (w[ 7], w[ 8], offset); w[13] = amd_bytealign_S (w[ 6], w[ 7], offset); w[12] = amd_bytealign_S (w[ 5], w[ 6], offset); w[11] = amd_bytealign_S (w[ 4], w[ 5], offset); w[10] = amd_bytealign_S (w[ 3], w[ 4], offset); w[ 9] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 8] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 7] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 6] = amd_bytealign_S ( 0, w[ 0], offset); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = amd_bytealign_S (w[55], w[56], offset); w[62] = amd_bytealign_S (w[54], w[55], offset); w[61] = amd_bytealign_S (w[53], w[54], offset); w[60] = amd_bytealign_S (w[52], w[53], offset); w[59] = amd_bytealign_S (w[51], w[52], offset); w[58] = amd_bytealign_S (w[50], w[51], offset); w[57] = amd_bytealign_S (w[49], w[50], offset); w[56] = amd_bytealign_S (w[48], w[49], offset); w[55] = amd_bytealign_S (w[47], w[48], offset); w[54] = amd_bytealign_S (w[46], w[47], offset); w[53] = amd_bytealign_S (w[45], w[46], offset); w[52] = amd_bytealign_S (w[44], w[45], offset); w[51] = amd_bytealign_S (w[43], w[44], offset); w[50] = amd_bytealign_S (w[42], w[43], offset); w[49] = amd_bytealign_S (w[41], w[42], offset); w[48] = amd_bytealign_S (w[40], w[41], offset); w[47] = amd_bytealign_S (w[39], w[40], offset); w[46] = amd_bytealign_S (w[38], w[39], offset); w[45] = amd_bytealign_S (w[37], w[38], offset); w[44] = amd_bytealign_S (w[36], w[37], offset); w[43] = amd_bytealign_S (w[35], w[36], offset); w[42] = 
amd_bytealign_S (w[34], w[35], offset); w[41] = amd_bytealign_S (w[33], w[34], offset); w[40] = amd_bytealign_S (w[32], w[33], offset); w[39] = amd_bytealign_S (w[31], w[32], offset); w[38] = amd_bytealign_S (w[30], w[31], offset); w[37] = amd_bytealign_S (w[29], w[30], offset); w[36] = amd_bytealign_S (w[28], w[29], offset); w[35] = amd_bytealign_S (w[27], w[28], offset); w[34] = amd_bytealign_S (w[26], w[27], offset); w[33] = amd_bytealign_S (w[25], w[26], offset); w[32] = amd_bytealign_S (w[24], w[25], offset); w[31] = amd_bytealign_S (w[23], w[24], offset); w[30] = amd_bytealign_S (w[22], w[23], offset); w[29] = amd_bytealign_S (w[21], w[22], offset); w[28] = amd_bytealign_S (w[20], w[21], offset); w[27] = amd_bytealign_S (w[19], w[20], offset); w[26] = amd_bytealign_S (w[18], w[19], offset); w[25] = amd_bytealign_S (w[17], w[18], offset); w[24] = amd_bytealign_S (w[16], w[17], offset); w[23] = amd_bytealign_S (w[15], w[16], offset); w[22] = amd_bytealign_S (w[14], w[15], offset); w[21] = amd_bytealign_S (w[13], w[14], offset); w[20] = amd_bytealign_S (w[12], w[13], offset); w[19] = amd_bytealign_S (w[11], w[12], offset); w[18] = amd_bytealign_S (w[10], w[11], offset); w[17] = amd_bytealign_S (w[ 9], w[10], offset); w[16] = amd_bytealign_S (w[ 8], w[ 9], offset); w[15] = amd_bytealign_S (w[ 7], w[ 8], offset); w[14] = amd_bytealign_S (w[ 6], w[ 7], offset); w[13] = amd_bytealign_S (w[ 5], w[ 6], offset); w[12] = amd_bytealign_S (w[ 4], w[ 5], offset); w[11] = amd_bytealign_S (w[ 3], w[ 4], offset); w[10] = amd_bytealign_S (w[ 2], w[ 3], offset); w[ 9] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 8] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 7] = amd_bytealign_S ( 0, w[ 0], offset); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = amd_bytealign_S (w[54], w[55], offset); w[62] = amd_bytealign_S (w[53], w[54], offset); w[61] = amd_bytealign_S (w[52], w[53], offset); w[60] = amd_bytealign_S (w[51], w[52], 
offset); w[59] = amd_bytealign_S (w[50], w[51], offset); w[58] = amd_bytealign_S (w[49], w[50], offset); w[57] = amd_bytealign_S (w[48], w[49], offset); w[56] = amd_bytealign_S (w[47], w[48], offset); w[55] = amd_bytealign_S (w[46], w[47], offset); w[54] = amd_bytealign_S (w[45], w[46], offset); w[53] = amd_bytealign_S (w[44], w[45], offset); w[52] = amd_bytealign_S (w[43], w[44], offset); w[51] = amd_bytealign_S (w[42], w[43], offset); w[50] = amd_bytealign_S (w[41], w[42], offset); w[49] = amd_bytealign_S (w[40], w[41], offset); w[48] = amd_bytealign_S (w[39], w[40], offset); w[47] = amd_bytealign_S (w[38], w[39], offset); w[46] = amd_bytealign_S (w[37], w[38], offset); w[45] = amd_bytealign_S (w[36], w[37], offset); w[44] = amd_bytealign_S (w[35], w[36], offset); w[43] = amd_bytealign_S (w[34], w[35], offset); w[42] = amd_bytealign_S (w[33], w[34], offset); w[41] = amd_bytealign_S (w[32], w[33], offset); w[40] = amd_bytealign_S (w[31], w[32], offset); w[39] = amd_bytealign_S (w[30], w[31], offset); w[38] = amd_bytealign_S (w[29], w[30], offset); w[37] = amd_bytealign_S (w[28], w[29], offset); w[36] = amd_bytealign_S (w[27], w[28], offset); w[35] = amd_bytealign_S (w[26], w[27], offset); w[34] = amd_bytealign_S (w[25], w[26], offset); w[33] = amd_bytealign_S (w[24], w[25], offset); w[32] = amd_bytealign_S (w[23], w[24], offset); w[31] = amd_bytealign_S (w[22], w[23], offset); w[30] = amd_bytealign_S (w[21], w[22], offset); w[29] = amd_bytealign_S (w[20], w[21], offset); w[28] = amd_bytealign_S (w[19], w[20], offset); w[27] = amd_bytealign_S (w[18], w[19], offset); w[26] = amd_bytealign_S (w[17], w[18], offset); w[25] = amd_bytealign_S (w[16], w[17], offset); w[24] = amd_bytealign_S (w[15], w[16], offset); w[23] = amd_bytealign_S (w[14], w[15], offset); w[22] = amd_bytealign_S (w[13], w[14], offset); w[21] = amd_bytealign_S (w[12], w[13], offset); w[20] = amd_bytealign_S (w[11], w[12], offset); w[19] = amd_bytealign_S (w[10], w[11], offset); w[18] = 
amd_bytealign_S (w[ 9], w[10], offset); w[17] = amd_bytealign_S (w[ 8], w[ 9], offset); w[16] = amd_bytealign_S (w[ 7], w[ 8], offset); w[15] = amd_bytealign_S (w[ 6], w[ 7], offset); w[14] = amd_bytealign_S (w[ 5], w[ 6], offset); w[13] = amd_bytealign_S (w[ 4], w[ 5], offset); w[12] = amd_bytealign_S (w[ 3], w[ 4], offset); w[11] = amd_bytealign_S (w[ 2], w[ 3], offset); w[10] = amd_bytealign_S (w[ 1], w[ 2], offset); w[ 9] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 8] = amd_bytealign_S ( 0, w[ 0], offset); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = amd_bytealign_S (w[53], w[54], offset); w[62] = amd_bytealign_S (w[52], w[53], offset); w[61] = amd_bytealign_S (w[51], w[52], offset); w[60] = amd_bytealign_S (w[50], w[51], offset); w[59] = amd_bytealign_S (w[49], w[50], offset); w[58] = amd_bytealign_S (w[48], w[49], offset); w[57] = amd_bytealign_S (w[47], w[48], offset); w[56] = amd_bytealign_S (w[46], w[47], offset); w[55] = amd_bytealign_S (w[45], w[46], offset); w[54] = amd_bytealign_S (w[44], w[45], offset); w[53] = amd_bytealign_S (w[43], w[44], offset); w[52] = amd_bytealign_S (w[42], w[43], offset); w[51] = amd_bytealign_S (w[41], w[42], offset); w[50] = amd_bytealign_S (w[40], w[41], offset); w[49] = amd_bytealign_S (w[39], w[40], offset); w[48] = amd_bytealign_S (w[38], w[39], offset); w[47] = amd_bytealign_S (w[37], w[38], offset); w[46] = amd_bytealign_S (w[36], w[37], offset); w[45] = amd_bytealign_S (w[35], w[36], offset); w[44] = amd_bytealign_S (w[34], w[35], offset); w[43] = amd_bytealign_S (w[33], w[34], offset); w[42] = amd_bytealign_S (w[32], w[33], offset); w[41] = amd_bytealign_S (w[31], w[32], offset); w[40] = amd_bytealign_S (w[30], w[31], offset); w[39] = amd_bytealign_S (w[29], w[30], offset); w[38] = amd_bytealign_S (w[28], w[29], offset); w[37] = amd_bytealign_S (w[27], w[28], offset); w[36] = amd_bytealign_S (w[26], w[27], offset); w[35] = amd_bytealign_S (w[25], 
w[26], offset); w[34] = amd_bytealign_S (w[24], w[25], offset); w[33] = amd_bytealign_S (w[23], w[24], offset); w[32] = amd_bytealign_S (w[22], w[23], offset); w[31] = amd_bytealign_S (w[21], w[22], offset); w[30] = amd_bytealign_S (w[20], w[21], offset); w[29] = amd_bytealign_S (w[19], w[20], offset); w[28] = amd_bytealign_S (w[18], w[19], offset); w[27] = amd_bytealign_S (w[17], w[18], offset); w[26] = amd_bytealign_S (w[16], w[17], offset); w[25] = amd_bytealign_S (w[15], w[16], offset); w[24] = amd_bytealign_S (w[14], w[15], offset); w[23] = amd_bytealign_S (w[13], w[14], offset); w[22] = amd_bytealign_S (w[12], w[13], offset); w[21] = amd_bytealign_S (w[11], w[12], offset); w[20] = amd_bytealign_S (w[10], w[11], offset); w[19] = amd_bytealign_S (w[ 9], w[10], offset); w[18] = amd_bytealign_S (w[ 8], w[ 9], offset); w[17] = amd_bytealign_S (w[ 7], w[ 8], offset); w[16] = amd_bytealign_S (w[ 6], w[ 7], offset); w[15] = amd_bytealign_S (w[ 5], w[ 6], offset); w[14] = amd_bytealign_S (w[ 4], w[ 5], offset); w[13] = amd_bytealign_S (w[ 3], w[ 4], offset); w[12] = amd_bytealign_S (w[ 2], w[ 3], offset); w[11] = amd_bytealign_S (w[ 1], w[ 2], offset); w[10] = amd_bytealign_S (w[ 0], w[ 1], offset); w[ 9] = amd_bytealign_S ( 0, w[ 0], offset); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = amd_bytealign_S (w[52], w[53], offset); w[62] = amd_bytealign_S (w[51], w[52], offset); w[61] = amd_bytealign_S (w[50], w[51], offset); w[60] = amd_bytealign_S (w[49], w[50], offset); w[59] = amd_bytealign_S (w[48], w[49], offset); w[58] = amd_bytealign_S (w[47], w[48], offset); w[57] = amd_bytealign_S (w[46], w[47], offset); w[56] = amd_bytealign_S (w[45], w[46], offset); w[55] = amd_bytealign_S (w[44], w[45], offset); w[54] = amd_bytealign_S (w[43], w[44], offset); w[53] = amd_bytealign_S (w[42], w[43], offset); w[52] = amd_bytealign_S (w[41], w[42], offset); w[51] = amd_bytealign_S (w[40], w[41], offset); 
w[50] = amd_bytealign_S (w[39], w[40], offset); w[49] = amd_bytealign_S (w[38], w[39], offset); w[48] = amd_bytealign_S (w[37], w[38], offset); w[47] = amd_bytealign_S (w[36], w[37], offset); w[46] = amd_bytealign_S (w[35], w[36], offset); w[45] = amd_bytealign_S (w[34], w[35], offset); w[44] = amd_bytealign_S (w[33], w[34], offset); w[43] = amd_bytealign_S (w[32], w[33], offset); w[42] = amd_bytealign_S (w[31], w[32], offset); w[41] = amd_bytealign_S (w[30], w[31], offset); w[40] = amd_bytealign_S (w[29], w[30], offset); w[39] = amd_bytealign_S (w[28], w[29], offset); w[38] = amd_bytealign_S (w[27], w[28], offset); w[37] = amd_bytealign_S (w[26], w[27], offset); w[36] = amd_bytealign_S (w[25], w[26], offset); w[35] = amd_bytealign_S (w[24], w[25], offset); w[34] = amd_bytealign_S (w[23], w[24], offset); w[33] = amd_bytealign_S (w[22], w[23], offset); w[32] = amd_bytealign_S (w[21], w[22], offset); w[31] = amd_bytealign_S (w[20], w[21], offset); w[30] = amd_bytealign_S (w[19], w[20], offset); w[29] = amd_bytealign_S (w[18], w[19], offset); w[28] = amd_bytealign_S (w[17], w[18], offset); w[27] = amd_bytealign_S (w[16], w[17], offset); w[26] = amd_bytealign_S (w[15], w[16], offset); w[25] = amd_bytealign_S (w[14], w[15], offset); w[24] = amd_bytealign_S (w[13], w[14], offset); w[23] = amd_bytealign_S (w[12], w[13], offset); w[22] = amd_bytealign_S (w[11], w[12], offset); w[21] = amd_bytealign_S (w[10], w[11], offset); w[20] = amd_bytealign_S (w[ 9], w[10], offset); w[19] = amd_bytealign_S (w[ 8], w[ 9], offset); w[18] = amd_bytealign_S (w[ 7], w[ 8], offset); w[17] = amd_bytealign_S (w[ 6], w[ 7], offset); w[16] = amd_bytealign_S (w[ 5], w[ 6], offset); w[15] = amd_bytealign_S (w[ 4], w[ 5], offset); w[14] = amd_bytealign_S (w[ 3], w[ 4], offset); w[13] = amd_bytealign_S (w[ 2], w[ 3], offset); w[12] = amd_bytealign_S (w[ 1], w[ 2], offset); w[11] = amd_bytealign_S (w[ 0], w[ 1], offset); w[10] = amd_bytealign_S ( 0, w[ 0], offset); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; 
w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = amd_bytealign_S (w[51], w[52], offset); w[62] = amd_bytealign_S (w[50], w[51], offset); w[61] = amd_bytealign_S (w[49], w[50], offset); w[60] = amd_bytealign_S (w[48], w[49], offset); w[59] = amd_bytealign_S (w[47], w[48], offset); w[58] = amd_bytealign_S (w[46], w[47], offset); w[57] = amd_bytealign_S (w[45], w[46], offset); w[56] = amd_bytealign_S (w[44], w[45], offset); w[55] = amd_bytealign_S (w[43], w[44], offset); w[54] = amd_bytealign_S (w[42], w[43], offset); w[53] = amd_bytealign_S (w[41], w[42], offset); w[52] = amd_bytealign_S (w[40], w[41], offset); w[51] = amd_bytealign_S (w[39], w[40], offset); w[50] = amd_bytealign_S (w[38], w[39], offset); w[49] = amd_bytealign_S (w[37], w[38], offset); w[48] = amd_bytealign_S (w[36], w[37], offset); w[47] = amd_bytealign_S (w[35], w[36], offset); w[46] = amd_bytealign_S (w[34], w[35], offset); w[45] = amd_bytealign_S (w[33], w[34], offset); w[44] = amd_bytealign_S (w[32], w[33], offset); w[43] = amd_bytealign_S (w[31], w[32], offset); w[42] = amd_bytealign_S (w[30], w[31], offset); w[41] = amd_bytealign_S (w[29], w[30], offset); w[40] = amd_bytealign_S (w[28], w[29], offset); w[39] = amd_bytealign_S (w[27], w[28], offset); w[38] = amd_bytealign_S (w[26], w[27], offset); w[37] = amd_bytealign_S (w[25], w[26], offset); w[36] = amd_bytealign_S (w[24], w[25], offset); w[35] = amd_bytealign_S (w[23], w[24], offset); w[34] = amd_bytealign_S (w[22], w[23], offset); w[33] = amd_bytealign_S (w[21], w[22], offset); w[32] = amd_bytealign_S (w[20], w[21], offset); w[31] = amd_bytealign_S (w[19], w[20], offset); w[30] = amd_bytealign_S (w[18], w[19], offset); w[29] = amd_bytealign_S (w[17], w[18], offset); w[28] = amd_bytealign_S (w[16], w[17], offset); w[27] = amd_bytealign_S (w[15], w[16], offset); w[26] = amd_bytealign_S (w[14], w[15], offset); w[25] = amd_bytealign_S (w[13], w[14], offset); w[24] = amd_bytealign_S (w[12], 
w[13], offset); w[23] = amd_bytealign_S (w[11], w[12], offset); w[22] = amd_bytealign_S (w[10], w[11], offset); w[21] = amd_bytealign_S (w[ 9], w[10], offset); w[20] = amd_bytealign_S (w[ 8], w[ 9], offset); w[19] = amd_bytealign_S (w[ 7], w[ 8], offset); w[18] = amd_bytealign_S (w[ 6], w[ 7], offset); w[17] = amd_bytealign_S (w[ 5], w[ 6], offset); w[16] = amd_bytealign_S (w[ 4], w[ 5], offset); w[15] = amd_bytealign_S (w[ 3], w[ 4], offset); w[14] = amd_bytealign_S (w[ 2], w[ 3], offset); w[13] = amd_bytealign_S (w[ 1], w[ 2], offset); w[12] = amd_bytealign_S (w[ 0], w[ 1], offset); w[11] = amd_bytealign_S ( 0, w[ 0], offset); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = amd_bytealign_S (w[50], w[51], offset); w[62] = amd_bytealign_S (w[49], w[50], offset); w[61] = amd_bytealign_S (w[48], w[49], offset); w[60] = amd_bytealign_S (w[47], w[48], offset); w[59] = amd_bytealign_S (w[46], w[47], offset); w[58] = amd_bytealign_S (w[45], w[46], offset); w[57] = amd_bytealign_S (w[44], w[45], offset); w[56] = amd_bytealign_S (w[43], w[44], offset); w[55] = amd_bytealign_S (w[42], w[43], offset); w[54] = amd_bytealign_S (w[41], w[42], offset); w[53] = amd_bytealign_S (w[40], w[41], offset); w[52] = amd_bytealign_S (w[39], w[40], offset); w[51] = amd_bytealign_S (w[38], w[39], offset); w[50] = amd_bytealign_S (w[37], w[38], offset); w[49] = amd_bytealign_S (w[36], w[37], offset); w[48] = amd_bytealign_S (w[35], w[36], offset); w[47] = amd_bytealign_S (w[34], w[35], offset); w[46] = amd_bytealign_S (w[33], w[34], offset); w[45] = amd_bytealign_S (w[32], w[33], offset); w[44] = amd_bytealign_S (w[31], w[32], offset); w[43] = amd_bytealign_S (w[30], w[31], offset); w[42] = amd_bytealign_S (w[29], w[30], offset); w[41] = amd_bytealign_S (w[28], w[29], offset); w[40] = amd_bytealign_S (w[27], w[28], offset); w[39] = amd_bytealign_S (w[26], w[27], offset); w[38] = amd_bytealign_S 
(w[25], w[26], offset); w[37] = amd_bytealign_S (w[24], w[25], offset); w[36] = amd_bytealign_S (w[23], w[24], offset); w[35] = amd_bytealign_S (w[22], w[23], offset); w[34] = amd_bytealign_S (w[21], w[22], offset); w[33] = amd_bytealign_S (w[20], w[21], offset); w[32] = amd_bytealign_S (w[19], w[20], offset); w[31] = amd_bytealign_S (w[18], w[19], offset); w[30] = amd_bytealign_S (w[17], w[18], offset); w[29] = amd_bytealign_S (w[16], w[17], offset); w[28] = amd_bytealign_S (w[15], w[16], offset); w[27] = amd_bytealign_S (w[14], w[15], offset); w[26] = amd_bytealign_S (w[13], w[14], offset); w[25] = amd_bytealign_S (w[12], w[13], offset); w[24] = amd_bytealign_S (w[11], w[12], offset); w[23] = amd_bytealign_S (w[10], w[11], offset); w[22] = amd_bytealign_S (w[ 9], w[10], offset); w[21] = amd_bytealign_S (w[ 8], w[ 9], offset); w[20] = amd_bytealign_S (w[ 7], w[ 8], offset); w[19] = amd_bytealign_S (w[ 6], w[ 7], offset); w[18] = amd_bytealign_S (w[ 5], w[ 6], offset); w[17] = amd_bytealign_S (w[ 4], w[ 5], offset); w[16] = amd_bytealign_S (w[ 3], w[ 4], offset); w[15] = amd_bytealign_S (w[ 2], w[ 3], offset); w[14] = amd_bytealign_S (w[ 1], w[ 2], offset); w[13] = amd_bytealign_S (w[ 0], w[ 1], offset); w[12] = amd_bytealign_S ( 0, w[ 0], offset); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = amd_bytealign_S (w[49], w[50], offset); w[62] = amd_bytealign_S (w[48], w[49], offset); w[61] = amd_bytealign_S (w[47], w[48], offset); w[60] = amd_bytealign_S (w[46], w[47], offset); w[59] = amd_bytealign_S (w[45], w[46], offset); w[58] = amd_bytealign_S (w[44], w[45], offset); w[57] = amd_bytealign_S (w[43], w[44], offset); w[56] = amd_bytealign_S (w[42], w[43], offset); w[55] = amd_bytealign_S (w[41], w[42], offset); w[54] = amd_bytealign_S (w[40], w[41], offset); w[53] = amd_bytealign_S (w[39], w[40], offset); w[52] = amd_bytealign_S (w[38], w[39], offset); w[51] 
= amd_bytealign_S (w[37], w[38], offset); w[50] = amd_bytealign_S (w[36], w[37], offset); w[49] = amd_bytealign_S (w[35], w[36], offset); w[48] = amd_bytealign_S (w[34], w[35], offset); w[47] = amd_bytealign_S (w[33], w[34], offset); w[46] = amd_bytealign_S (w[32], w[33], offset); w[45] = amd_bytealign_S (w[31], w[32], offset); w[44] = amd_bytealign_S (w[30], w[31], offset); w[43] = amd_bytealign_S (w[29], w[30], offset); w[42] = amd_bytealign_S (w[28], w[29], offset); w[41] = amd_bytealign_S (w[27], w[28], offset); w[40] = amd_bytealign_S (w[26], w[27], offset); w[39] = amd_bytealign_S (w[25], w[26], offset); w[38] = amd_bytealign_S (w[24], w[25], offset); w[37] = amd_bytealign_S (w[23], w[24], offset); w[36] = amd_bytealign_S (w[22], w[23], offset); w[35] = amd_bytealign_S (w[21], w[22], offset); w[34] = amd_bytealign_S (w[20], w[21], offset); w[33] = amd_bytealign_S (w[19], w[20], offset); w[32] = amd_bytealign_S (w[18], w[19], offset); w[31] = amd_bytealign_S (w[17], w[18], offset); w[30] = amd_bytealign_S (w[16], w[17], offset); w[29] = amd_bytealign_S (w[15], w[16], offset); w[28] = amd_bytealign_S (w[14], w[15], offset); w[27] = amd_bytealign_S (w[13], w[14], offset); w[26] = amd_bytealign_S (w[12], w[13], offset); w[25] = amd_bytealign_S (w[11], w[12], offset); w[24] = amd_bytealign_S (w[10], w[11], offset); w[23] = amd_bytealign_S (w[ 9], w[10], offset); w[22] = amd_bytealign_S (w[ 8], w[ 9], offset); w[21] = amd_bytealign_S (w[ 7], w[ 8], offset); w[20] = amd_bytealign_S (w[ 6], w[ 7], offset); w[19] = amd_bytealign_S (w[ 5], w[ 6], offset); w[18] = amd_bytealign_S (w[ 4], w[ 5], offset); w[17] = amd_bytealign_S (w[ 3], w[ 4], offset); w[16] = amd_bytealign_S (w[ 2], w[ 3], offset); w[15] = amd_bytealign_S (w[ 1], w[ 2], offset); w[14] = amd_bytealign_S (w[ 0], w[ 1], offset); w[13] = amd_bytealign_S ( 0, w[ 0], offset); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 
0] = 0; break; case 14: w[63] = amd_bytealign_S (w[48], w[49], offset); w[62] = amd_bytealign_S (w[47], w[48], offset); w[61] = amd_bytealign_S (w[46], w[47], offset); w[60] = amd_bytealign_S (w[45], w[46], offset); w[59] = amd_bytealign_S (w[44], w[45], offset); w[58] = amd_bytealign_S (w[43], w[44], offset); w[57] = amd_bytealign_S (w[42], w[43], offset); w[56] = amd_bytealign_S (w[41], w[42], offset); w[55] = amd_bytealign_S (w[40], w[41], offset); w[54] = amd_bytealign_S (w[39], w[40], offset); w[53] = amd_bytealign_S (w[38], w[39], offset); w[52] = amd_bytealign_S (w[37], w[38], offset); w[51] = amd_bytealign_S (w[36], w[37], offset); w[50] = amd_bytealign_S (w[35], w[36], offset); w[49] = amd_bytealign_S (w[34], w[35], offset); w[48] = amd_bytealign_S (w[33], w[34], offset); w[47] = amd_bytealign_S (w[32], w[33], offset); w[46] = amd_bytealign_S (w[31], w[32], offset); w[45] = amd_bytealign_S (w[30], w[31], offset); w[44] = amd_bytealign_S (w[29], w[30], offset); w[43] = amd_bytealign_S (w[28], w[29], offset); w[42] = amd_bytealign_S (w[27], w[28], offset); w[41] = amd_bytealign_S (w[26], w[27], offset); w[40] = amd_bytealign_S (w[25], w[26], offset); w[39] = amd_bytealign_S (w[24], w[25], offset); w[38] = amd_bytealign_S (w[23], w[24], offset); w[37] = amd_bytealign_S (w[22], w[23], offset); w[36] = amd_bytealign_S (w[21], w[22], offset); w[35] = amd_bytealign_S (w[20], w[21], offset); w[34] = amd_bytealign_S (w[19], w[20], offset); w[33] = amd_bytealign_S (w[18], w[19], offset); w[32] = amd_bytealign_S (w[17], w[18], offset); w[31] = amd_bytealign_S (w[16], w[17], offset); w[30] = amd_bytealign_S (w[15], w[16], offset); w[29] = amd_bytealign_S (w[14], w[15], offset); w[28] = amd_bytealign_S (w[13], w[14], offset); w[27] = amd_bytealign_S (w[12], w[13], offset); w[26] = amd_bytealign_S (w[11], w[12], offset); w[25] = amd_bytealign_S (w[10], w[11], offset); w[24] = amd_bytealign_S (w[ 9], w[10], offset); w[23] = amd_bytealign_S (w[ 8], w[ 9], offset); w[22] = 
amd_bytealign_S (w[ 7], w[ 8], offset); w[21] = amd_bytealign_S (w[ 6], w[ 7], offset); w[20] = amd_bytealign_S (w[ 5], w[ 6], offset); w[19] = amd_bytealign_S (w[ 4], w[ 5], offset); w[18] = amd_bytealign_S (w[ 3], w[ 4], offset); w[17] = amd_bytealign_S (w[ 2], w[ 3], offset); w[16] = amd_bytealign_S (w[ 1], w[ 2], offset); w[15] = amd_bytealign_S (w[ 0], w[ 1], offset); w[14] = amd_bytealign_S ( 0, w[ 0], offset); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = amd_bytealign_S (w[47], w[48], offset); w[62] = amd_bytealign_S (w[46], w[47], offset); w[61] = amd_bytealign_S (w[45], w[46], offset); w[60] = amd_bytealign_S (w[44], w[45], offset); w[59] = amd_bytealign_S (w[43], w[44], offset); w[58] = amd_bytealign_S (w[42], w[43], offset); w[57] = amd_bytealign_S (w[41], w[42], offset); w[56] = amd_bytealign_S (w[40], w[41], offset); w[55] = amd_bytealign_S (w[39], w[40], offset); w[54] = amd_bytealign_S (w[38], w[39], offset); w[53] = amd_bytealign_S (w[37], w[38], offset); w[52] = amd_bytealign_S (w[36], w[37], offset); w[51] = amd_bytealign_S (w[35], w[36], offset); w[50] = amd_bytealign_S (w[34], w[35], offset); w[49] = amd_bytealign_S (w[33], w[34], offset); w[48] = amd_bytealign_S (w[32], w[33], offset); w[47] = amd_bytealign_S (w[31], w[32], offset); w[46] = amd_bytealign_S (w[30], w[31], offset); w[45] = amd_bytealign_S (w[29], w[30], offset); w[44] = amd_bytealign_S (w[28], w[29], offset); w[43] = amd_bytealign_S (w[27], w[28], offset); w[42] = amd_bytealign_S (w[26], w[27], offset); w[41] = amd_bytealign_S (w[25], w[26], offset); w[40] = amd_bytealign_S (w[24], w[25], offset); w[39] = amd_bytealign_S (w[23], w[24], offset); w[38] = amd_bytealign_S (w[22], w[23], offset); w[37] = amd_bytealign_S (w[21], w[22], offset); w[36] = amd_bytealign_S (w[20], w[21], offset); w[35] = amd_bytealign_S (w[19], w[20], offset); w[34] = 
amd_bytealign_S (w[18], w[19], offset); w[33] = amd_bytealign_S (w[17], w[18], offset); w[32] = amd_bytealign_S (w[16], w[17], offset); w[31] = amd_bytealign_S (w[15], w[16], offset); w[30] = amd_bytealign_S (w[14], w[15], offset); w[29] = amd_bytealign_S (w[13], w[14], offset); w[28] = amd_bytealign_S (w[12], w[13], offset); w[27] = amd_bytealign_S (w[11], w[12], offset); w[26] = amd_bytealign_S (w[10], w[11], offset); w[25] = amd_bytealign_S (w[ 9], w[10], offset); w[24] = amd_bytealign_S (w[ 8], w[ 9], offset); w[23] = amd_bytealign_S (w[ 7], w[ 8], offset); w[22] = amd_bytealign_S (w[ 6], w[ 7], offset); w[21] = amd_bytealign_S (w[ 5], w[ 6], offset); w[20] = amd_bytealign_S (w[ 4], w[ 5], offset); w[19] = amd_bytealign_S (w[ 3], w[ 4], offset); w[18] = amd_bytealign_S (w[ 2], w[ 3], offset); w[17] = amd_bytealign_S (w[ 1], w[ 2], offset); w[16] = amd_bytealign_S (w[ 0], w[ 1], offset); w[15] = amd_bytealign_S ( 0, w[ 0], offset); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = amd_bytealign_S (w[46], w[47], offset); w[62] = amd_bytealign_S (w[45], w[46], offset); w[61] = amd_bytealign_S (w[44], w[45], offset); w[60] = amd_bytealign_S (w[43], w[44], offset); w[59] = amd_bytealign_S (w[42], w[43], offset); w[58] = amd_bytealign_S (w[41], w[42], offset); w[57] = amd_bytealign_S (w[40], w[41], offset); w[56] = amd_bytealign_S (w[39], w[40], offset); w[55] = amd_bytealign_S (w[38], w[39], offset); w[54] = amd_bytealign_S (w[37], w[38], offset); w[53] = amd_bytealign_S (w[36], w[37], offset); w[52] = amd_bytealign_S (w[35], w[36], offset); w[51] = amd_bytealign_S (w[34], w[35], offset); w[50] = amd_bytealign_S (w[33], w[34], offset); w[49] = amd_bytealign_S (w[32], w[33], offset); w[48] = amd_bytealign_S (w[31], w[32], offset); w[47] = amd_bytealign_S (w[30], w[31], offset); w[46] = amd_bytealign_S (w[29], w[30], offset); w[45] 
= amd_bytealign_S (w[28], w[29], offset); w[44] = amd_bytealign_S (w[27], w[28], offset); w[43] = amd_bytealign_S (w[26], w[27], offset); w[42] = amd_bytealign_S (w[25], w[26], offset); w[41] = amd_bytealign_S (w[24], w[25], offset); w[40] = amd_bytealign_S (w[23], w[24], offset); w[39] = amd_bytealign_S (w[22], w[23], offset); w[38] = amd_bytealign_S (w[21], w[22], offset); w[37] = amd_bytealign_S (w[20], w[21], offset); w[36] = amd_bytealign_S (w[19], w[20], offset); w[35] = amd_bytealign_S (w[18], w[19], offset); w[34] = amd_bytealign_S (w[17], w[18], offset); w[33] = amd_bytealign_S (w[16], w[17], offset); w[32] = amd_bytealign_S (w[15], w[16], offset); w[31] = amd_bytealign_S (w[14], w[15], offset); w[30] = amd_bytealign_S (w[13], w[14], offset); w[29] = amd_bytealign_S (w[12], w[13], offset); w[28] = amd_bytealign_S (w[11], w[12], offset); w[27] = amd_bytealign_S (w[10], w[11], offset); w[26] = amd_bytealign_S (w[ 9], w[10], offset); w[25] = amd_bytealign_S (w[ 8], w[ 9], offset); w[24] = amd_bytealign_S (w[ 7], w[ 8], offset); w[23] = amd_bytealign_S (w[ 6], w[ 7], offset); w[22] = amd_bytealign_S (w[ 5], w[ 6], offset); w[21] = amd_bytealign_S (w[ 4], w[ 5], offset); w[20] = amd_bytealign_S (w[ 3], w[ 4], offset); w[19] = amd_bytealign_S (w[ 2], w[ 3], offset); w[18] = amd_bytealign_S (w[ 1], w[ 2], offset); w[17] = amd_bytealign_S (w[ 0], w[ 1], offset); w[16] = amd_bytealign_S ( 0, w[ 0], offset); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = amd_bytealign_S (w[45], w[46], offset); w[62] = amd_bytealign_S (w[44], w[45], offset); w[61] = amd_bytealign_S (w[43], w[44], offset); w[60] = amd_bytealign_S (w[42], w[43], offset); w[59] = amd_bytealign_S (w[41], w[42], offset); w[58] = amd_bytealign_S (w[40], w[41], offset); w[57] = amd_bytealign_S (w[39], w[40], offset); w[56] = amd_bytealign_S (w[38], w[39], 
offset); w[55] = amd_bytealign_S (w[37], w[38], offset); w[54] = amd_bytealign_S (w[36], w[37], offset); w[53] = amd_bytealign_S (w[35], w[36], offset); w[52] = amd_bytealign_S (w[34], w[35], offset); w[51] = amd_bytealign_S (w[33], w[34], offset); w[50] = amd_bytealign_S (w[32], w[33], offset); w[49] = amd_bytealign_S (w[31], w[32], offset); w[48] = amd_bytealign_S (w[30], w[31], offset); w[47] = amd_bytealign_S (w[29], w[30], offset); w[46] = amd_bytealign_S (w[28], w[29], offset); w[45] = amd_bytealign_S (w[27], w[28], offset); w[44] = amd_bytealign_S (w[26], w[27], offset); w[43] = amd_bytealign_S (w[25], w[26], offset); w[42] = amd_bytealign_S (w[24], w[25], offset); w[41] = amd_bytealign_S (w[23], w[24], offset); w[40] = amd_bytealign_S (w[22], w[23], offset); w[39] = amd_bytealign_S (w[21], w[22], offset); w[38] = amd_bytealign_S (w[20], w[21], offset); w[37] = amd_bytealign_S (w[19], w[20], offset); w[36] = amd_bytealign_S (w[18], w[19], offset); w[35] = amd_bytealign_S (w[17], w[18], offset); w[34] = amd_bytealign_S (w[16], w[17], offset); w[33] = amd_bytealign_S (w[15], w[16], offset); w[32] = amd_bytealign_S (w[14], w[15], offset); w[31] = amd_bytealign_S (w[13], w[14], offset); w[30] = amd_bytealign_S (w[12], w[13], offset); w[29] = amd_bytealign_S (w[11], w[12], offset); w[28] = amd_bytealign_S (w[10], w[11], offset); w[27] = amd_bytealign_S (w[ 9], w[10], offset); w[26] = amd_bytealign_S (w[ 8], w[ 9], offset); w[25] = amd_bytealign_S (w[ 7], w[ 8], offset); w[24] = amd_bytealign_S (w[ 6], w[ 7], offset); w[23] = amd_bytealign_S (w[ 5], w[ 6], offset); w[22] = amd_bytealign_S (w[ 4], w[ 5], offset); w[21] = amd_bytealign_S (w[ 3], w[ 4], offset); w[20] = amd_bytealign_S (w[ 2], w[ 3], offset); w[19] = amd_bytealign_S (w[ 1], w[ 2], offset); w[18] = amd_bytealign_S (w[ 0], w[ 1], offset); w[17] = amd_bytealign_S ( 0, w[ 0], offset); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; 
w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = amd_bytealign_S (w[44], w[45], offset); w[62] = amd_bytealign_S (w[43], w[44], offset); w[61] = amd_bytealign_S (w[42], w[43], offset); w[60] = amd_bytealign_S (w[41], w[42], offset); w[59] = amd_bytealign_S (w[40], w[41], offset); w[58] = amd_bytealign_S (w[39], w[40], offset); w[57] = amd_bytealign_S (w[38], w[39], offset); w[56] = amd_bytealign_S (w[37], w[38], offset); w[55] = amd_bytealign_S (w[36], w[37], offset); w[54] = amd_bytealign_S (w[35], w[36], offset); w[53] = amd_bytealign_S (w[34], w[35], offset); w[52] = amd_bytealign_S (w[33], w[34], offset); w[51] = amd_bytealign_S (w[32], w[33], offset); w[50] = amd_bytealign_S (w[31], w[32], offset); w[49] = amd_bytealign_S (w[30], w[31], offset); w[48] = amd_bytealign_S (w[29], w[30], offset); w[47] = amd_bytealign_S (w[28], w[29], offset); w[46] = amd_bytealign_S (w[27], w[28], offset); w[45] = amd_bytealign_S (w[26], w[27], offset); w[44] = amd_bytealign_S (w[25], w[26], offset); w[43] = amd_bytealign_S (w[24], w[25], offset); w[42] = amd_bytealign_S (w[23], w[24], offset); w[41] = amd_bytealign_S (w[22], w[23], offset); w[40] = amd_bytealign_S (w[21], w[22], offset); w[39] = amd_bytealign_S (w[20], w[21], offset); w[38] = amd_bytealign_S (w[19], w[20], offset); w[37] = amd_bytealign_S (w[18], w[19], offset); w[36] = amd_bytealign_S (w[17], w[18], offset); w[35] = amd_bytealign_S (w[16], w[17], offset); w[34] = amd_bytealign_S (w[15], w[16], offset); w[33] = amd_bytealign_S (w[14], w[15], offset); w[32] = amd_bytealign_S (w[13], w[14], offset); w[31] = amd_bytealign_S (w[12], w[13], offset); w[30] = amd_bytealign_S (w[11], w[12], offset); w[29] = amd_bytealign_S (w[10], w[11], offset); w[28] = amd_bytealign_S (w[ 9], w[10], offset); w[27] = amd_bytealign_S (w[ 8], w[ 9], offset); w[26] = amd_bytealign_S (w[ 7], w[ 8], offset); w[25] = amd_bytealign_S (w[ 6], w[ 7], offset); w[24] = amd_bytealign_S (w[ 5], w[ 6], 
offset); w[23] = amd_bytealign_S (w[ 4], w[ 5], offset); w[22] = amd_bytealign_S (w[ 3], w[ 4], offset); w[21] = amd_bytealign_S (w[ 2], w[ 3], offset); w[20] = amd_bytealign_S (w[ 1], w[ 2], offset); w[19] = amd_bytealign_S (w[ 0], w[ 1], offset); w[18] = amd_bytealign_S ( 0, w[ 0], offset); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = amd_bytealign_S (w[43], w[44], offset); w[62] = amd_bytealign_S (w[42], w[43], offset); w[61] = amd_bytealign_S (w[41], w[42], offset); w[60] = amd_bytealign_S (w[40], w[41], offset); w[59] = amd_bytealign_S (w[39], w[40], offset); w[58] = amd_bytealign_S (w[38], w[39], offset); w[57] = amd_bytealign_S (w[37], w[38], offset); w[56] = amd_bytealign_S (w[36], w[37], offset); w[55] = amd_bytealign_S (w[35], w[36], offset); w[54] = amd_bytealign_S (w[34], w[35], offset); w[53] = amd_bytealign_S (w[33], w[34], offset); w[52] = amd_bytealign_S (w[32], w[33], offset); w[51] = amd_bytealign_S (w[31], w[32], offset); w[50] = amd_bytealign_S (w[30], w[31], offset); w[49] = amd_bytealign_S (w[29], w[30], offset); w[48] = amd_bytealign_S (w[28], w[29], offset); w[47] = amd_bytealign_S (w[27], w[28], offset); w[46] = amd_bytealign_S (w[26], w[27], offset); w[45] = amd_bytealign_S (w[25], w[26], offset); w[44] = amd_bytealign_S (w[24], w[25], offset); w[43] = amd_bytealign_S (w[23], w[24], offset); w[42] = amd_bytealign_S (w[22], w[23], offset); w[41] = amd_bytealign_S (w[21], w[22], offset); w[40] = amd_bytealign_S (w[20], w[21], offset); w[39] = amd_bytealign_S (w[19], w[20], offset); w[38] = amd_bytealign_S (w[18], w[19], offset); w[37] = amd_bytealign_S (w[17], w[18], offset); w[36] = amd_bytealign_S (w[16], w[17], offset); w[35] = amd_bytealign_S (w[15], w[16], offset); w[34] = amd_bytealign_S (w[14], w[15], offset); w[33] = amd_bytealign_S (w[13], w[14], offset); 
w[32] = amd_bytealign_S (w[12], w[13], offset); w[31] = amd_bytealign_S (w[11], w[12], offset); w[30] = amd_bytealign_S (w[10], w[11], offset); w[29] = amd_bytealign_S (w[ 9], w[10], offset); w[28] = amd_bytealign_S (w[ 8], w[ 9], offset); w[27] = amd_bytealign_S (w[ 7], w[ 8], offset); w[26] = amd_bytealign_S (w[ 6], w[ 7], offset); w[25] = amd_bytealign_S (w[ 5], w[ 6], offset); w[24] = amd_bytealign_S (w[ 4], w[ 5], offset); w[23] = amd_bytealign_S (w[ 3], w[ 4], offset); w[22] = amd_bytealign_S (w[ 2], w[ 3], offset); w[21] = amd_bytealign_S (w[ 1], w[ 2], offset); w[20] = amd_bytealign_S (w[ 0], w[ 1], offset); w[19] = amd_bytealign_S ( 0, w[ 0], offset); w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = amd_bytealign_S (w[42], w[43], offset); w[62] = amd_bytealign_S (w[41], w[42], offset); w[61] = amd_bytealign_S (w[40], w[41], offset); w[60] = amd_bytealign_S (w[39], w[40], offset); w[59] = amd_bytealign_S (w[38], w[39], offset); w[58] = amd_bytealign_S (w[37], w[38], offset); w[57] = amd_bytealign_S (w[36], w[37], offset); w[56] = amd_bytealign_S (w[35], w[36], offset); w[55] = amd_bytealign_S (w[34], w[35], offset); w[54] = amd_bytealign_S (w[33], w[34], offset); w[53] = amd_bytealign_S (w[32], w[33], offset); w[52] = amd_bytealign_S (w[31], w[32], offset); w[51] = amd_bytealign_S (w[30], w[31], offset); w[50] = amd_bytealign_S (w[29], w[30], offset); w[49] = amd_bytealign_S (w[28], w[29], offset); w[48] = amd_bytealign_S (w[27], w[28], offset); w[47] = amd_bytealign_S (w[26], w[27], offset); w[46] = amd_bytealign_S (w[25], w[26], offset); w[45] = amd_bytealign_S (w[24], w[25], offset); w[44] = amd_bytealign_S (w[23], w[24], offset); w[43] = amd_bytealign_S (w[22], w[23], offset); w[42] = amd_bytealign_S (w[21], w[22], offset); w[41] = amd_bytealign_S (w[20], w[21], offset); 
w[40] = amd_bytealign_S (w[19], w[20], offset); w[39] = amd_bytealign_S (w[18], w[19], offset); w[38] = amd_bytealign_S (w[17], w[18], offset); w[37] = amd_bytealign_S (w[16], w[17], offset); w[36] = amd_bytealign_S (w[15], w[16], offset); w[35] = amd_bytealign_S (w[14], w[15], offset); w[34] = amd_bytealign_S (w[13], w[14], offset); w[33] = amd_bytealign_S (w[12], w[13], offset); w[32] = amd_bytealign_S (w[11], w[12], offset); w[31] = amd_bytealign_S (w[10], w[11], offset); w[30] = amd_bytealign_S (w[ 9], w[10], offset); w[29] = amd_bytealign_S (w[ 8], w[ 9], offset); w[28] = amd_bytealign_S (w[ 7], w[ 8], offset); w[27] = amd_bytealign_S (w[ 6], w[ 7], offset); w[26] = amd_bytealign_S (w[ 5], w[ 6], offset); w[25] = amd_bytealign_S (w[ 4], w[ 5], offset); w[24] = amd_bytealign_S (w[ 3], w[ 4], offset); w[23] = amd_bytealign_S (w[ 2], w[ 3], offset); w[22] = amd_bytealign_S (w[ 1], w[ 2], offset); w[21] = amd_bytealign_S (w[ 0], w[ 1], offset); w[20] = amd_bytealign_S ( 0, w[ 0], offset); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = amd_bytealign_S (w[41], w[42], offset); w[62] = amd_bytealign_S (w[40], w[41], offset); w[61] = amd_bytealign_S (w[39], w[40], offset); w[60] = amd_bytealign_S (w[38], w[39], offset); w[59] = amd_bytealign_S (w[37], w[38], offset); w[58] = amd_bytealign_S (w[36], w[37], offset); w[57] = amd_bytealign_S (w[35], w[36], offset); w[56] = amd_bytealign_S (w[34], w[35], offset); w[55] = amd_bytealign_S (w[33], w[34], offset); w[54] = amd_bytealign_S (w[32], w[33], offset); w[53] = amd_bytealign_S (w[31], w[32], offset); w[52] = amd_bytealign_S (w[30], w[31], offset); w[51] = amd_bytealign_S (w[29], w[30], offset); w[50] = amd_bytealign_S (w[28], w[29], offset); w[49] = amd_bytealign_S (w[27], w[28], offset); w[48] = amd_bytealign_S (w[26], w[27], 
offset); w[47] = amd_bytealign_S (w[25], w[26], offset); w[46] = amd_bytealign_S (w[24], w[25], offset); w[45] = amd_bytealign_S (w[23], w[24], offset); w[44] = amd_bytealign_S (w[22], w[23], offset); w[43] = amd_bytealign_S (w[21], w[22], offset); w[42] = amd_bytealign_S (w[20], w[21], offset); w[41] = amd_bytealign_S (w[19], w[20], offset); w[40] = amd_bytealign_S (w[18], w[19], offset); w[39] = amd_bytealign_S (w[17], w[18], offset); w[38] = amd_bytealign_S (w[16], w[17], offset); w[37] = amd_bytealign_S (w[15], w[16], offset); w[36] = amd_bytealign_S (w[14], w[15], offset); w[35] = amd_bytealign_S (w[13], w[14], offset); w[34] = amd_bytealign_S (w[12], w[13], offset); w[33] = amd_bytealign_S (w[11], w[12], offset); w[32] = amd_bytealign_S (w[10], w[11], offset); w[31] = amd_bytealign_S (w[ 9], w[10], offset); w[30] = amd_bytealign_S (w[ 8], w[ 9], offset); w[29] = amd_bytealign_S (w[ 7], w[ 8], offset); w[28] = amd_bytealign_S (w[ 6], w[ 7], offset); w[27] = amd_bytealign_S (w[ 5], w[ 6], offset); w[26] = amd_bytealign_S (w[ 4], w[ 5], offset); w[25] = amd_bytealign_S (w[ 3], w[ 4], offset); w[24] = amd_bytealign_S (w[ 2], w[ 3], offset); w[23] = amd_bytealign_S (w[ 1], w[ 2], offset); w[22] = amd_bytealign_S (w[ 0], w[ 1], offset); w[21] = amd_bytealign_S ( 0, w[ 0], offset); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = amd_bytealign_S (w[40], w[41], offset); w[62] = amd_bytealign_S (w[39], w[40], offset); w[61] = amd_bytealign_S (w[38], w[39], offset); w[60] = amd_bytealign_S (w[37], w[38], offset); w[59] = amd_bytealign_S (w[36], w[37], offset); w[58] = amd_bytealign_S (w[35], w[36], offset); w[57] = amd_bytealign_S (w[34], w[35], offset); w[56] = amd_bytealign_S (w[33], w[34], offset); w[55] = amd_bytealign_S (w[32], w[33], offset); w[54] = 
amd_bytealign_S (w[31], w[32], offset); w[53] = amd_bytealign_S (w[30], w[31], offset); w[52] = amd_bytealign_S (w[29], w[30], offset); w[51] = amd_bytealign_S (w[28], w[29], offset); w[50] = amd_bytealign_S (w[27], w[28], offset); w[49] = amd_bytealign_S (w[26], w[27], offset); w[48] = amd_bytealign_S (w[25], w[26], offset); w[47] = amd_bytealign_S (w[24], w[25], offset); w[46] = amd_bytealign_S (w[23], w[24], offset); w[45] = amd_bytealign_S (w[22], w[23], offset); w[44] = amd_bytealign_S (w[21], w[22], offset); w[43] = amd_bytealign_S (w[20], w[21], offset); w[42] = amd_bytealign_S (w[19], w[20], offset); w[41] = amd_bytealign_S (w[18], w[19], offset); w[40] = amd_bytealign_S (w[17], w[18], offset); w[39] = amd_bytealign_S (w[16], w[17], offset); w[38] = amd_bytealign_S (w[15], w[16], offset); w[37] = amd_bytealign_S (w[14], w[15], offset); w[36] = amd_bytealign_S (w[13], w[14], offset); w[35] = amd_bytealign_S (w[12], w[13], offset); w[34] = amd_bytealign_S (w[11], w[12], offset); w[33] = amd_bytealign_S (w[10], w[11], offset); w[32] = amd_bytealign_S (w[ 9], w[10], offset); w[31] = amd_bytealign_S (w[ 8], w[ 9], offset); w[30] = amd_bytealign_S (w[ 7], w[ 8], offset); w[29] = amd_bytealign_S (w[ 6], w[ 7], offset); w[28] = amd_bytealign_S (w[ 5], w[ 6], offset); w[27] = amd_bytealign_S (w[ 4], w[ 5], offset); w[26] = amd_bytealign_S (w[ 3], w[ 4], offset); w[25] = amd_bytealign_S (w[ 2], w[ 3], offset); w[24] = amd_bytealign_S (w[ 1], w[ 2], offset); w[23] = amd_bytealign_S (w[ 0], w[ 1], offset); w[22] = amd_bytealign_S ( 0, w[ 0], offset); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = amd_bytealign_S (w[39], w[40], offset); w[62] = amd_bytealign_S (w[38], w[39], offset); w[61] = amd_bytealign_S (w[37], w[38], offset); w[60] = amd_bytealign_S 
(w[36], w[37], offset); w[59] = amd_bytealign_S (w[35], w[36], offset); w[58] = amd_bytealign_S (w[34], w[35], offset); w[57] = amd_bytealign_S (w[33], w[34], offset); w[56] = amd_bytealign_S (w[32], w[33], offset); w[55] = amd_bytealign_S (w[31], w[32], offset); w[54] = amd_bytealign_S (w[30], w[31], offset); w[53] = amd_bytealign_S (w[29], w[30], offset); w[52] = amd_bytealign_S (w[28], w[29], offset); w[51] = amd_bytealign_S (w[27], w[28], offset); w[50] = amd_bytealign_S (w[26], w[27], offset); w[49] = amd_bytealign_S (w[25], w[26], offset); w[48] = amd_bytealign_S (w[24], w[25], offset); w[47] = amd_bytealign_S (w[23], w[24], offset); w[46] = amd_bytealign_S (w[22], w[23], offset); w[45] = amd_bytealign_S (w[21], w[22], offset); w[44] = amd_bytealign_S (w[20], w[21], offset); w[43] = amd_bytealign_S (w[19], w[20], offset); w[42] = amd_bytealign_S (w[18], w[19], offset); w[41] = amd_bytealign_S (w[17], w[18], offset); w[40] = amd_bytealign_S (w[16], w[17], offset); w[39] = amd_bytealign_S (w[15], w[16], offset); w[38] = amd_bytealign_S (w[14], w[15], offset); w[37] = amd_bytealign_S (w[13], w[14], offset); w[36] = amd_bytealign_S (w[12], w[13], offset); w[35] = amd_bytealign_S (w[11], w[12], offset); w[34] = amd_bytealign_S (w[10], w[11], offset); w[33] = amd_bytealign_S (w[ 9], w[10], offset); w[32] = amd_bytealign_S (w[ 8], w[ 9], offset); w[31] = amd_bytealign_S (w[ 7], w[ 8], offset); w[30] = amd_bytealign_S (w[ 6], w[ 7], offset); w[29] = amd_bytealign_S (w[ 5], w[ 6], offset); w[28] = amd_bytealign_S (w[ 4], w[ 5], offset); w[27] = amd_bytealign_S (w[ 3], w[ 4], offset); w[26] = amd_bytealign_S (w[ 2], w[ 3], offset); w[25] = amd_bytealign_S (w[ 1], w[ 2], offset); w[24] = amd_bytealign_S (w[ 0], w[ 1], offset); w[23] = amd_bytealign_S ( 0, w[ 0], offset); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 
4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = amd_bytealign_S (w[38], w[39], offset); w[62] = amd_bytealign_S (w[37], w[38], offset); w[61] = amd_bytealign_S (w[36], w[37], offset); w[60] = amd_bytealign_S (w[35], w[36], offset); w[59] = amd_bytealign_S (w[34], w[35], offset); w[58] = amd_bytealign_S (w[33], w[34], offset); w[57] = amd_bytealign_S (w[32], w[33], offset); w[56] = amd_bytealign_S (w[31], w[32], offset); w[55] = amd_bytealign_S (w[30], w[31], offset); w[54] = amd_bytealign_S (w[29], w[30], offset); w[53] = amd_bytealign_S (w[28], w[29], offset); w[52] = amd_bytealign_S (w[27], w[28], offset); w[51] = amd_bytealign_S (w[26], w[27], offset); w[50] = amd_bytealign_S (w[25], w[26], offset); w[49] = amd_bytealign_S (w[24], w[25], offset); w[48] = amd_bytealign_S (w[23], w[24], offset); w[47] = amd_bytealign_S (w[22], w[23], offset); w[46] = amd_bytealign_S (w[21], w[22], offset); w[45] = amd_bytealign_S (w[20], w[21], offset); w[44] = amd_bytealign_S (w[19], w[20], offset); w[43] = amd_bytealign_S (w[18], w[19], offset); w[42] = amd_bytealign_S (w[17], w[18], offset); w[41] = amd_bytealign_S (w[16], w[17], offset); w[40] = amd_bytealign_S (w[15], w[16], offset); w[39] = amd_bytealign_S (w[14], w[15], offset); w[38] = amd_bytealign_S (w[13], w[14], offset); w[37] = amd_bytealign_S (w[12], w[13], offset); w[36] = amd_bytealign_S (w[11], w[12], offset); w[35] = amd_bytealign_S (w[10], w[11], offset); w[34] = amd_bytealign_S (w[ 9], w[10], offset); w[33] = amd_bytealign_S (w[ 8], w[ 9], offset); w[32] = amd_bytealign_S (w[ 7], w[ 8], offset); w[31] = amd_bytealign_S (w[ 6], w[ 7], offset); w[30] = amd_bytealign_S (w[ 5], w[ 6], offset); w[29] = amd_bytealign_S (w[ 4], w[ 5], offset); w[28] = amd_bytealign_S (w[ 3], w[ 4], offset); w[27] = amd_bytealign_S (w[ 2], w[ 3], offset); w[26] = amd_bytealign_S (w[ 1], w[ 2], offset); w[25] = amd_bytealign_S (w[ 0], w[ 1], offset); w[24] = amd_bytealign_S ( 0, w[ 0], offset); w[23] = 0; 
w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = amd_bytealign_S (w[37], w[38], offset); w[62] = amd_bytealign_S (w[36], w[37], offset); w[61] = amd_bytealign_S (w[35], w[36], offset); w[60] = amd_bytealign_S (w[34], w[35], offset); w[59] = amd_bytealign_S (w[33], w[34], offset); w[58] = amd_bytealign_S (w[32], w[33], offset); w[57] = amd_bytealign_S (w[31], w[32], offset); w[56] = amd_bytealign_S (w[30], w[31], offset); w[55] = amd_bytealign_S (w[29], w[30], offset); w[54] = amd_bytealign_S (w[28], w[29], offset); w[53] = amd_bytealign_S (w[27], w[28], offset); w[52] = amd_bytealign_S (w[26], w[27], offset); w[51] = amd_bytealign_S (w[25], w[26], offset); w[50] = amd_bytealign_S (w[24], w[25], offset); w[49] = amd_bytealign_S (w[23], w[24], offset); w[48] = amd_bytealign_S (w[22], w[23], offset); w[47] = amd_bytealign_S (w[21], w[22], offset); w[46] = amd_bytealign_S (w[20], w[21], offset); w[45] = amd_bytealign_S (w[19], w[20], offset); w[44] = amd_bytealign_S (w[18], w[19], offset); w[43] = amd_bytealign_S (w[17], w[18], offset); w[42] = amd_bytealign_S (w[16], w[17], offset); w[41] = amd_bytealign_S (w[15], w[16], offset); w[40] = amd_bytealign_S (w[14], w[15], offset); w[39] = amd_bytealign_S (w[13], w[14], offset); w[38] = amd_bytealign_S (w[12], w[13], offset); w[37] = amd_bytealign_S (w[11], w[12], offset); w[36] = amd_bytealign_S (w[10], w[11], offset); w[35] = amd_bytealign_S (w[ 9], w[10], offset); w[34] = amd_bytealign_S (w[ 8], w[ 9], offset); w[33] = amd_bytealign_S (w[ 7], w[ 8], offset); w[32] = amd_bytealign_S (w[ 6], w[ 7], offset); w[31] = amd_bytealign_S (w[ 5], w[ 6], offset); w[30] = amd_bytealign_S (w[ 4], w[ 5], offset); w[29] = amd_bytealign_S (w[ 3], w[ 4], offset); w[28] = amd_bytealign_S (w[ 2], w[ 3], offset); 
w[27] = amd_bytealign_S (w[ 1], w[ 2], offset); w[26] = amd_bytealign_S (w[ 0], w[ 1], offset); w[25] = amd_bytealign_S ( 0, w[ 0], offset); w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = amd_bytealign_S (w[36], w[37], offset); w[62] = amd_bytealign_S (w[35], w[36], offset); w[61] = amd_bytealign_S (w[34], w[35], offset); w[60] = amd_bytealign_S (w[33], w[34], offset); w[59] = amd_bytealign_S (w[32], w[33], offset); w[58] = amd_bytealign_S (w[31], w[32], offset); w[57] = amd_bytealign_S (w[30], w[31], offset); w[56] = amd_bytealign_S (w[29], w[30], offset); w[55] = amd_bytealign_S (w[28], w[29], offset); w[54] = amd_bytealign_S (w[27], w[28], offset); w[53] = amd_bytealign_S (w[26], w[27], offset); w[52] = amd_bytealign_S (w[25], w[26], offset); w[51] = amd_bytealign_S (w[24], w[25], offset); w[50] = amd_bytealign_S (w[23], w[24], offset); w[49] = amd_bytealign_S (w[22], w[23], offset); w[48] = amd_bytealign_S (w[21], w[22], offset); w[47] = amd_bytealign_S (w[20], w[21], offset); w[46] = amd_bytealign_S (w[19], w[20], offset); w[45] = amd_bytealign_S (w[18], w[19], offset); w[44] = amd_bytealign_S (w[17], w[18], offset); w[43] = amd_bytealign_S (w[16], w[17], offset); w[42] = amd_bytealign_S (w[15], w[16], offset); w[41] = amd_bytealign_S (w[14], w[15], offset); w[40] = amd_bytealign_S (w[13], w[14], offset); w[39] = amd_bytealign_S (w[12], w[13], offset); w[38] = amd_bytealign_S (w[11], w[12], offset); w[37] = amd_bytealign_S (w[10], w[11], offset); w[36] = amd_bytealign_S (w[ 9], w[10], offset); w[35] = amd_bytealign_S (w[ 8], w[ 9], offset); w[34] = amd_bytealign_S (w[ 7], w[ 8], offset); w[33] = amd_bytealign_S (w[ 6], w[ 7], offset); w[32] = amd_bytealign_S (w[ 5], w[ 6], offset); w[31] = amd_bytealign_S (w[ 4], 
w[ 5], offset); w[30] = amd_bytealign_S (w[ 3], w[ 4], offset); w[29] = amd_bytealign_S (w[ 2], w[ 3], offset); w[28] = amd_bytealign_S (w[ 1], w[ 2], offset); w[27] = amd_bytealign_S (w[ 0], w[ 1], offset); w[26] = amd_bytealign_S ( 0, w[ 0], offset); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = amd_bytealign_S (w[35], w[36], offset); w[62] = amd_bytealign_S (w[34], w[35], offset); w[61] = amd_bytealign_S (w[33], w[34], offset); w[60] = amd_bytealign_S (w[32], w[33], offset); w[59] = amd_bytealign_S (w[31], w[32], offset); w[58] = amd_bytealign_S (w[30], w[31], offset); w[57] = amd_bytealign_S (w[29], w[30], offset); w[56] = amd_bytealign_S (w[28], w[29], offset); w[55] = amd_bytealign_S (w[27], w[28], offset); w[54] = amd_bytealign_S (w[26], w[27], offset); w[53] = amd_bytealign_S (w[25], w[26], offset); w[52] = amd_bytealign_S (w[24], w[25], offset); w[51] = amd_bytealign_S (w[23], w[24], offset); w[50] = amd_bytealign_S (w[22], w[23], offset); w[49] = amd_bytealign_S (w[21], w[22], offset); w[48] = amd_bytealign_S (w[20], w[21], offset); w[47] = amd_bytealign_S (w[19], w[20], offset); w[46] = amd_bytealign_S (w[18], w[19], offset); w[45] = amd_bytealign_S (w[17], w[18], offset); w[44] = amd_bytealign_S (w[16], w[17], offset); w[43] = amd_bytealign_S (w[15], w[16], offset); w[42] = amd_bytealign_S (w[14], w[15], offset); w[41] = amd_bytealign_S (w[13], w[14], offset); w[40] = amd_bytealign_S (w[12], w[13], offset); w[39] = amd_bytealign_S (w[11], w[12], offset); w[38] = amd_bytealign_S (w[10], w[11], offset); w[37] = amd_bytealign_S (w[ 9], w[10], offset); w[36] = amd_bytealign_S (w[ 8], w[ 9], offset); w[35] = amd_bytealign_S (w[ 7], w[ 8], offset); w[34] = amd_bytealign_S (w[ 6], w[ 7], offset); 
w[33] = amd_bytealign_S (w[ 5], w[ 6], offset); w[32] = amd_bytealign_S (w[ 4], w[ 5], offset); w[31] = amd_bytealign_S (w[ 3], w[ 4], offset); w[30] = amd_bytealign_S (w[ 2], w[ 3], offset); w[29] = amd_bytealign_S (w[ 1], w[ 2], offset); w[28] = amd_bytealign_S (w[ 0], w[ 1], offset); w[27] = amd_bytealign_S ( 0, w[ 0], offset); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = amd_bytealign_S (w[34], w[35], offset); w[62] = amd_bytealign_S (w[33], w[34], offset); w[61] = amd_bytealign_S (w[32], w[33], offset); w[60] = amd_bytealign_S (w[31], w[32], offset); w[59] = amd_bytealign_S (w[30], w[31], offset); w[58] = amd_bytealign_S (w[29], w[30], offset); w[57] = amd_bytealign_S (w[28], w[29], offset); w[56] = amd_bytealign_S (w[27], w[28], offset); w[55] = amd_bytealign_S (w[26], w[27], offset); w[54] = amd_bytealign_S (w[25], w[26], offset); w[53] = amd_bytealign_S (w[24], w[25], offset); w[52] = amd_bytealign_S (w[23], w[24], offset); w[51] = amd_bytealign_S (w[22], w[23], offset); w[50] = amd_bytealign_S (w[21], w[22], offset); w[49] = amd_bytealign_S (w[20], w[21], offset); w[48] = amd_bytealign_S (w[19], w[20], offset); w[47] = amd_bytealign_S (w[18], w[19], offset); w[46] = amd_bytealign_S (w[17], w[18], offset); w[45] = amd_bytealign_S (w[16], w[17], offset); w[44] = amd_bytealign_S (w[15], w[16], offset); w[43] = amd_bytealign_S (w[14], w[15], offset); w[42] = amd_bytealign_S (w[13], w[14], offset); w[41] = amd_bytealign_S (w[12], w[13], offset); w[40] = amd_bytealign_S (w[11], w[12], offset); w[39] = amd_bytealign_S (w[10], w[11], offset); w[38] = amd_bytealign_S (w[ 9], w[10], offset); w[37] = amd_bytealign_S (w[ 8], w[ 9], offset); w[36] = amd_bytealign_S (w[ 7], w[ 8], offset); w[35] = 
amd_bytealign_S (w[ 6], w[ 7], offset); w[34] = amd_bytealign_S (w[ 5], w[ 6], offset); w[33] = amd_bytealign_S (w[ 4], w[ 5], offset); w[32] = amd_bytealign_S (w[ 3], w[ 4], offset); w[31] = amd_bytealign_S (w[ 2], w[ 3], offset); w[30] = amd_bytealign_S (w[ 1], w[ 2], offset); w[29] = amd_bytealign_S (w[ 0], w[ 1], offset); w[28] = amd_bytealign_S ( 0, w[ 0], offset); w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 29: w[63] = amd_bytealign_S (w[33], w[34], offset); w[62] = amd_bytealign_S (w[32], w[33], offset); w[61] = amd_bytealign_S (w[31], w[32], offset); w[60] = amd_bytealign_S (w[30], w[31], offset); w[59] = amd_bytealign_S (w[29], w[30], offset); w[58] = amd_bytealign_S (w[28], w[29], offset); w[57] = amd_bytealign_S (w[27], w[28], offset); w[56] = amd_bytealign_S (w[26], w[27], offset); w[55] = amd_bytealign_S (w[25], w[26], offset); w[54] = amd_bytealign_S (w[24], w[25], offset); w[53] = amd_bytealign_S (w[23], w[24], offset); w[52] = amd_bytealign_S (w[22], w[23], offset); w[51] = amd_bytealign_S (w[21], w[22], offset); w[50] = amd_bytealign_S (w[20], w[21], offset); w[49] = amd_bytealign_S (w[19], w[20], offset); w[48] = amd_bytealign_S (w[18], w[19], offset); w[47] = amd_bytealign_S (w[17], w[18], offset); w[46] = amd_bytealign_S (w[16], w[17], offset); w[45] = amd_bytealign_S (w[15], w[16], offset); w[44] = amd_bytealign_S (w[14], w[15], offset); w[43] = amd_bytealign_S (w[13], w[14], offset); w[42] = amd_bytealign_S (w[12], w[13], offset); w[41] = amd_bytealign_S (w[11], w[12], offset); w[40] = amd_bytealign_S (w[10], w[11], offset); w[39] = amd_bytealign_S (w[ 9], w[10], offset); w[38] = amd_bytealign_S (w[ 8], w[ 9], offset); w[37] = amd_bytealign_S (w[ 7], w[ 8], offset); w[36] 
= amd_bytealign_S (w[ 6], w[ 7], offset); w[35] = amd_bytealign_S (w[ 5], w[ 6], offset); w[34] = amd_bytealign_S (w[ 4], w[ 5], offset); w[33] = amd_bytealign_S (w[ 3], w[ 4], offset); w[32] = amd_bytealign_S (w[ 2], w[ 3], offset); w[31] = amd_bytealign_S (w[ 1], w[ 2], offset); w[30] = amd_bytealign_S (w[ 0], w[ 1], offset); w[29] = amd_bytealign_S ( 0, w[ 0], offset); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 30: w[63] = amd_bytealign_S (w[32], w[33], offset); w[62] = amd_bytealign_S (w[31], w[32], offset); w[61] = amd_bytealign_S (w[30], w[31], offset); w[60] = amd_bytealign_S (w[29], w[30], offset); w[59] = amd_bytealign_S (w[28], w[29], offset); w[58] = amd_bytealign_S (w[27], w[28], offset); w[57] = amd_bytealign_S (w[26], w[27], offset); w[56] = amd_bytealign_S (w[25], w[26], offset); w[55] = amd_bytealign_S (w[24], w[25], offset); w[54] = amd_bytealign_S (w[23], w[24], offset); w[53] = amd_bytealign_S (w[22], w[23], offset); w[52] = amd_bytealign_S (w[21], w[22], offset); w[51] = amd_bytealign_S (w[20], w[21], offset); w[50] = amd_bytealign_S (w[19], w[20], offset); w[49] = amd_bytealign_S (w[18], w[19], offset); w[48] = amd_bytealign_S (w[17], w[18], offset); w[47] = amd_bytealign_S (w[16], w[17], offset); w[46] = amd_bytealign_S (w[15], w[16], offset); w[45] = amd_bytealign_S (w[14], w[15], offset); w[44] = amd_bytealign_S (w[13], w[14], offset); w[43] = amd_bytealign_S (w[12], w[13], offset); w[42] = amd_bytealign_S (w[11], w[12], offset); w[41] = amd_bytealign_S (w[10], w[11], offset); w[40] = amd_bytealign_S (w[ 9], w[10], offset); w[39] = amd_bytealign_S (w[ 8], w[ 9], offset); w[38] = amd_bytealign_S (w[ 7], w[ 8], offset); w[37] = amd_bytealign_S (w[ 6], w[ 7], 
offset); w[36] = amd_bytealign_S (w[ 5], w[ 6], offset); w[35] = amd_bytealign_S (w[ 4], w[ 5], offset); w[34] = amd_bytealign_S (w[ 3], w[ 4], offset); w[33] = amd_bytealign_S (w[ 2], w[ 3], offset); w[32] = amd_bytealign_S (w[ 1], w[ 2], offset); w[31] = amd_bytealign_S (w[ 0], w[ 1], offset); w[30] = amd_bytealign_S ( 0, w[ 0], offset); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 31: w[63] = amd_bytealign_S (w[31], w[32], offset); w[62] = amd_bytealign_S (w[30], w[31], offset); w[61] = amd_bytealign_S (w[29], w[30], offset); w[60] = amd_bytealign_S (w[28], w[29], offset); w[59] = amd_bytealign_S (w[27], w[28], offset); w[58] = amd_bytealign_S (w[26], w[27], offset); w[57] = amd_bytealign_S (w[25], w[26], offset); w[56] = amd_bytealign_S (w[24], w[25], offset); w[55] = amd_bytealign_S (w[23], w[24], offset); w[54] = amd_bytealign_S (w[22], w[23], offset); w[53] = amd_bytealign_S (w[21], w[22], offset); w[52] = amd_bytealign_S (w[20], w[21], offset); w[51] = amd_bytealign_S (w[19], w[20], offset); w[50] = amd_bytealign_S (w[18], w[19], offset); w[49] = amd_bytealign_S (w[17], w[18], offset); w[48] = amd_bytealign_S (w[16], w[17], offset); w[47] = amd_bytealign_S (w[15], w[16], offset); w[46] = amd_bytealign_S (w[14], w[15], offset); w[45] = amd_bytealign_S (w[13], w[14], offset); w[44] = amd_bytealign_S (w[12], w[13], offset); w[43] = amd_bytealign_S (w[11], w[12], offset); w[42] = amd_bytealign_S (w[10], w[11], offset); w[41] = amd_bytealign_S (w[ 9], w[10], offset); w[40] = amd_bytealign_S (w[ 8], w[ 9], offset); w[39] = amd_bytealign_S (w[ 7], w[ 8], offset); w[38] = amd_bytealign_S (w[ 6], w[ 7], offset); w[37] = amd_bytealign_S (w[ 5], w[ 6], offset); w[36] = 
amd_bytealign_S (w[ 4], w[ 5], offset); w[35] = amd_bytealign_S (w[ 3], w[ 4], offset); w[34] = amd_bytealign_S (w[ 2], w[ 3], offset); w[33] = amd_bytealign_S (w[ 1], w[ 2], offset); w[32] = amd_bytealign_S (w[ 0], w[ 1], offset); w[31] = amd_bytealign_S ( 0, w[ 0], offset); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 32: w[63] = amd_bytealign_S (w[30], w[31], offset); w[62] = amd_bytealign_S (w[29], w[30], offset); w[61] = amd_bytealign_S (w[28], w[29], offset); w[60] = amd_bytealign_S (w[27], w[28], offset); w[59] = amd_bytealign_S (w[26], w[27], offset); w[58] = amd_bytealign_S (w[25], w[26], offset); w[57] = amd_bytealign_S (w[24], w[25], offset); w[56] = amd_bytealign_S (w[23], w[24], offset); w[55] = amd_bytealign_S (w[22], w[23], offset); w[54] = amd_bytealign_S (w[21], w[22], offset); w[53] = amd_bytealign_S (w[20], w[21], offset); w[52] = amd_bytealign_S (w[19], w[20], offset); w[51] = amd_bytealign_S (w[18], w[19], offset); w[50] = amd_bytealign_S (w[17], w[18], offset); w[49] = amd_bytealign_S (w[16], w[17], offset); w[48] = amd_bytealign_S (w[15], w[16], offset); w[47] = amd_bytealign_S (w[14], w[15], offset); w[46] = amd_bytealign_S (w[13], w[14], offset); w[45] = amd_bytealign_S (w[12], w[13], offset); w[44] = amd_bytealign_S (w[11], w[12], offset); w[43] = amd_bytealign_S (w[10], w[11], offset); w[42] = amd_bytealign_S (w[ 9], w[10], offset); w[41] = amd_bytealign_S (w[ 8], w[ 9], offset); w[40] = amd_bytealign_S (w[ 7], w[ 8], offset); w[39] = amd_bytealign_S (w[ 6], w[ 7], offset); w[38] = amd_bytealign_S (w[ 5], w[ 6], offset); w[37] = amd_bytealign_S (w[ 4], w[ 5], offset); w[36] = amd_bytealign_S (w[ 3], w[ 4], offset); w[35] = 
amd_bytealign_S (w[ 2], w[ 3], offset); w[34] = amd_bytealign_S (w[ 1], w[ 2], offset); w[33] = amd_bytealign_S (w[ 0], w[ 1], offset); w[32] = amd_bytealign_S ( 0, w[ 0], offset); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = amd_bytealign_S (w[29], w[30], offset); w[62] = amd_bytealign_S (w[28], w[29], offset); w[61] = amd_bytealign_S (w[27], w[28], offset); w[60] = amd_bytealign_S (w[26], w[27], offset); w[59] = amd_bytealign_S (w[25], w[26], offset); w[58] = amd_bytealign_S (w[24], w[25], offset); w[57] = amd_bytealign_S (w[23], w[24], offset); w[56] = amd_bytealign_S (w[22], w[23], offset); w[55] = amd_bytealign_S (w[21], w[22], offset); w[54] = amd_bytealign_S (w[20], w[21], offset); w[53] = amd_bytealign_S (w[19], w[20], offset); w[52] = amd_bytealign_S (w[18], w[19], offset); w[51] = amd_bytealign_S (w[17], w[18], offset); w[50] = amd_bytealign_S (w[16], w[17], offset); w[49] = amd_bytealign_S (w[15], w[16], offset); w[48] = amd_bytealign_S (w[14], w[15], offset); w[47] = amd_bytealign_S (w[13], w[14], offset); w[46] = amd_bytealign_S (w[12], w[13], offset); w[45] = amd_bytealign_S (w[11], w[12], offset); w[44] = amd_bytealign_S (w[10], w[11], offset); w[43] = amd_bytealign_S (w[ 9], w[10], offset); w[42] = amd_bytealign_S (w[ 8], w[ 9], offset); w[41] = amd_bytealign_S (w[ 7], w[ 8], offset); w[40] = amd_bytealign_S (w[ 6], w[ 7], offset); w[39] = amd_bytealign_S (w[ 5], w[ 6], offset); w[38] = amd_bytealign_S (w[ 4], w[ 5], offset); w[37] = amd_bytealign_S (w[ 3], w[ 4], offset); w[36] = amd_bytealign_S (w[ 2], w[ 3], offset); w[35] = amd_bytealign_S (w[ 1], w[ 2], offset); w[34] = amd_bytealign_S (w[ 0], w[ 1], offset); w[33] = 
amd_bytealign_S ( 0, w[ 0], offset); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = amd_bytealign_S (w[28], w[29], offset); w[62] = amd_bytealign_S (w[27], w[28], offset); w[61] = amd_bytealign_S (w[26], w[27], offset); w[60] = amd_bytealign_S (w[25], w[26], offset); w[59] = amd_bytealign_S (w[24], w[25], offset); w[58] = amd_bytealign_S (w[23], w[24], offset); w[57] = amd_bytealign_S (w[22], w[23], offset); w[56] = amd_bytealign_S (w[21], w[22], offset); w[55] = amd_bytealign_S (w[20], w[21], offset); w[54] = amd_bytealign_S (w[19], w[20], offset); w[53] = amd_bytealign_S (w[18], w[19], offset); w[52] = amd_bytealign_S (w[17], w[18], offset); w[51] = amd_bytealign_S (w[16], w[17], offset); w[50] = amd_bytealign_S (w[15], w[16], offset); w[49] = amd_bytealign_S (w[14], w[15], offset); w[48] = amd_bytealign_S (w[13], w[14], offset); w[47] = amd_bytealign_S (w[12], w[13], offset); w[46] = amd_bytealign_S (w[11], w[12], offset); w[45] = amd_bytealign_S (w[10], w[11], offset); w[44] = amd_bytealign_S (w[ 9], w[10], offset); w[43] = amd_bytealign_S (w[ 8], w[ 9], offset); w[42] = amd_bytealign_S (w[ 7], w[ 8], offset); w[41] = amd_bytealign_S (w[ 6], w[ 7], offset); w[40] = amd_bytealign_S (w[ 5], w[ 6], offset); w[39] = amd_bytealign_S (w[ 4], w[ 5], offset); w[38] = amd_bytealign_S (w[ 3], w[ 4], offset); w[37] = amd_bytealign_S (w[ 2], w[ 3], offset); w[36] = amd_bytealign_S (w[ 1], w[ 2], offset); w[35] = amd_bytealign_S (w[ 0], w[ 1], offset); w[34] = amd_bytealign_S ( 0, w[ 0], offset); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; 
w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = amd_bytealign_S (w[27], w[28], offset); w[62] = amd_bytealign_S (w[26], w[27], offset); w[61] = amd_bytealign_S (w[25], w[26], offset); w[60] = amd_bytealign_S (w[24], w[25], offset); w[59] = amd_bytealign_S (w[23], w[24], offset); w[58] = amd_bytealign_S (w[22], w[23], offset); w[57] = amd_bytealign_S (w[21], w[22], offset); w[56] = amd_bytealign_S (w[20], w[21], offset); w[55] = amd_bytealign_S (w[19], w[20], offset); w[54] = amd_bytealign_S (w[18], w[19], offset); w[53] = amd_bytealign_S (w[17], w[18], offset); w[52] = amd_bytealign_S (w[16], w[17], offset); w[51] = amd_bytealign_S (w[15], w[16], offset); w[50] = amd_bytealign_S (w[14], w[15], offset); w[49] = amd_bytealign_S (w[13], w[14], offset); w[48] = amd_bytealign_S (w[12], w[13], offset); w[47] = amd_bytealign_S (w[11], w[12], offset); w[46] = amd_bytealign_S (w[10], w[11], offset); w[45] = amd_bytealign_S (w[ 9], w[10], offset); w[44] = amd_bytealign_S (w[ 8], w[ 9], offset); w[43] = amd_bytealign_S (w[ 7], w[ 8], offset); w[42] = amd_bytealign_S (w[ 6], w[ 7], offset); w[41] = amd_bytealign_S (w[ 5], w[ 6], offset); w[40] = amd_bytealign_S (w[ 4], w[ 5], offset); w[39] = amd_bytealign_S (w[ 3], w[ 4], offset); w[38] = amd_bytealign_S (w[ 2], w[ 3], offset); w[37] = amd_bytealign_S (w[ 1], w[ 2], offset); w[36] = amd_bytealign_S (w[ 0], w[ 1], offset); w[35] = amd_bytealign_S ( 0, w[ 0], offset); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; 
w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = amd_bytealign_S (w[26], w[27], offset); w[62] = amd_bytealign_S (w[25], w[26], offset); w[61] = amd_bytealign_S (w[24], w[25], offset); w[60] = amd_bytealign_S (w[23], w[24], offset); w[59] = amd_bytealign_S (w[22], w[23], offset); w[58] = amd_bytealign_S (w[21], w[22], offset); w[57] = amd_bytealign_S (w[20], w[21], offset); w[56] = amd_bytealign_S (w[19], w[20], offset); w[55] = amd_bytealign_S (w[18], w[19], offset); w[54] = amd_bytealign_S (w[17], w[18], offset); w[53] = amd_bytealign_S (w[16], w[17], offset); w[52] = amd_bytealign_S (w[15], w[16], offset); w[51] = amd_bytealign_S (w[14], w[15], offset); w[50] = amd_bytealign_S (w[13], w[14], offset); w[49] = amd_bytealign_S (w[12], w[13], offset); w[48] = amd_bytealign_S (w[11], w[12], offset); w[47] = amd_bytealign_S (w[10], w[11], offset); w[46] = amd_bytealign_S (w[ 9], w[10], offset); w[45] = amd_bytealign_S (w[ 8], w[ 9], offset); w[44] = amd_bytealign_S (w[ 7], w[ 8], offset); w[43] = amd_bytealign_S (w[ 6], w[ 7], offset); w[42] = amd_bytealign_S (w[ 5], w[ 6], offset); w[41] = amd_bytealign_S (w[ 4], w[ 5], offset); w[40] = amd_bytealign_S (w[ 3], w[ 4], offset); w[39] = amd_bytealign_S (w[ 2], w[ 3], offset); w[38] = amd_bytealign_S (w[ 1], w[ 2], offset); w[37] = amd_bytealign_S (w[ 0], w[ 1], offset); w[36] = amd_bytealign_S ( 0, w[ 0], offset); w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = amd_bytealign_S (w[25], w[26], offset); w[62] = amd_bytealign_S (w[24], w[25], offset); w[61] = amd_bytealign_S (w[23], w[24], offset); w[60] = amd_bytealign_S (w[22], w[23], offset); w[59] = 
amd_bytealign_S (w[21], w[22], offset); w[58] = amd_bytealign_S (w[20], w[21], offset); w[57] = amd_bytealign_S (w[19], w[20], offset); w[56] = amd_bytealign_S (w[18], w[19], offset); w[55] = amd_bytealign_S (w[17], w[18], offset); w[54] = amd_bytealign_S (w[16], w[17], offset); w[53] = amd_bytealign_S (w[15], w[16], offset); w[52] = amd_bytealign_S (w[14], w[15], offset); w[51] = amd_bytealign_S (w[13], w[14], offset); w[50] = amd_bytealign_S (w[12], w[13], offset); w[49] = amd_bytealign_S (w[11], w[12], offset); w[48] = amd_bytealign_S (w[10], w[11], offset); w[47] = amd_bytealign_S (w[ 9], w[10], offset); w[46] = amd_bytealign_S (w[ 8], w[ 9], offset); w[45] = amd_bytealign_S (w[ 7], w[ 8], offset); w[44] = amd_bytealign_S (w[ 6], w[ 7], offset); w[43] = amd_bytealign_S (w[ 5], w[ 6], offset); w[42] = amd_bytealign_S (w[ 4], w[ 5], offset); w[41] = amd_bytealign_S (w[ 3], w[ 4], offset); w[40] = amd_bytealign_S (w[ 2], w[ 3], offset); w[39] = amd_bytealign_S (w[ 1], w[ 2], offset); w[38] = amd_bytealign_S (w[ 0], w[ 1], offset); w[37] = amd_bytealign_S ( 0, w[ 0], offset); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = amd_bytealign_S (w[24], w[25], offset); w[62] = amd_bytealign_S (w[23], w[24], offset); w[61] = amd_bytealign_S (w[22], w[23], offset); w[60] = amd_bytealign_S (w[21], w[22], offset); w[59] = amd_bytealign_S (w[20], w[21], offset); w[58] = amd_bytealign_S (w[19], w[20], offset); w[57] = amd_bytealign_S (w[18], w[19], offset); w[56] = amd_bytealign_S (w[17], w[18], offset); w[55] = amd_bytealign_S (w[16], w[17], offset); w[54] = amd_bytealign_S (w[15], w[16], offset); 
w[53] = amd_bytealign_S (w[14], w[15], offset); w[52] = amd_bytealign_S (w[13], w[14], offset); w[51] = amd_bytealign_S (w[12], w[13], offset); w[50] = amd_bytealign_S (w[11], w[12], offset); w[49] = amd_bytealign_S (w[10], w[11], offset); w[48] = amd_bytealign_S (w[ 9], w[10], offset); w[47] = amd_bytealign_S (w[ 8], w[ 9], offset); w[46] = amd_bytealign_S (w[ 7], w[ 8], offset); w[45] = amd_bytealign_S (w[ 6], w[ 7], offset); w[44] = amd_bytealign_S (w[ 5], w[ 6], offset); w[43] = amd_bytealign_S (w[ 4], w[ 5], offset); w[42] = amd_bytealign_S (w[ 3], w[ 4], offset); w[41] = amd_bytealign_S (w[ 2], w[ 3], offset); w[40] = amd_bytealign_S (w[ 1], w[ 2], offset); w[39] = amd_bytealign_S (w[ 0], w[ 1], offset); w[38] = amd_bytealign_S ( 0, w[ 0], offset); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = amd_bytealign_S (w[23], w[24], offset); w[62] = amd_bytealign_S (w[22], w[23], offset); w[61] = amd_bytealign_S (w[21], w[22], offset); w[60] = amd_bytealign_S (w[20], w[21], offset); w[59] = amd_bytealign_S (w[19], w[20], offset); w[58] = amd_bytealign_S (w[18], w[19], offset); w[57] = amd_bytealign_S (w[17], w[18], offset); w[56] = amd_bytealign_S (w[16], w[17], offset); w[55] = amd_bytealign_S (w[15], w[16], offset); w[54] = amd_bytealign_S (w[14], w[15], offset); w[53] = amd_bytealign_S (w[13], w[14], offset); w[52] = amd_bytealign_S (w[12], w[13], offset); w[51] = amd_bytealign_S (w[11], w[12], offset); w[50] = amd_bytealign_S (w[10], w[11], offset); w[49] = amd_bytealign_S (w[ 9], w[10], offset); w[48] = amd_bytealign_S (w[ 8], w[ 9], offset); w[47] = amd_bytealign_S (w[ 7], 
w[ 8], offset); w[46] = amd_bytealign_S (w[ 6], w[ 7], offset); w[45] = amd_bytealign_S (w[ 5], w[ 6], offset); w[44] = amd_bytealign_S (w[ 4], w[ 5], offset); w[43] = amd_bytealign_S (w[ 3], w[ 4], offset); w[42] = amd_bytealign_S (w[ 2], w[ 3], offset); w[41] = amd_bytealign_S (w[ 1], w[ 2], offset); w[40] = amd_bytealign_S (w[ 0], w[ 1], offset); w[39] = amd_bytealign_S ( 0, w[ 0], offset); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = amd_bytealign_S (w[22], w[23], offset); w[62] = amd_bytealign_S (w[21], w[22], offset); w[61] = amd_bytealign_S (w[20], w[21], offset); w[60] = amd_bytealign_S (w[19], w[20], offset); w[59] = amd_bytealign_S (w[18], w[19], offset); w[58] = amd_bytealign_S (w[17], w[18], offset); w[57] = amd_bytealign_S (w[16], w[17], offset); w[56] = amd_bytealign_S (w[15], w[16], offset); w[55] = amd_bytealign_S (w[14], w[15], offset); w[54] = amd_bytealign_S (w[13], w[14], offset); w[53] = amd_bytealign_S (w[12], w[13], offset); w[52] = amd_bytealign_S (w[11], w[12], offset); w[51] = amd_bytealign_S (w[10], w[11], offset); w[50] = amd_bytealign_S (w[ 9], w[10], offset); w[49] = amd_bytealign_S (w[ 8], w[ 9], offset); w[48] = amd_bytealign_S (w[ 7], w[ 8], offset); w[47] = amd_bytealign_S (w[ 6], w[ 7], offset); w[46] = amd_bytealign_S (w[ 5], w[ 6], offset); w[45] = amd_bytealign_S (w[ 4], w[ 5], offset); w[44] = amd_bytealign_S (w[ 3], w[ 4], offset); w[43] = amd_bytealign_S (w[ 2], w[ 3], offset); w[42] = amd_bytealign_S (w[ 1], w[ 2], offset); w[41] = amd_bytealign_S (w[ 0], w[ 1], offset); w[40] = amd_bytealign_S ( 0, w[ 0], offset); w[39] = 
0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = amd_bytealign_S (w[21], w[22], offset); w[62] = amd_bytealign_S (w[20], w[21], offset); w[61] = amd_bytealign_S (w[19], w[20], offset); w[60] = amd_bytealign_S (w[18], w[19], offset); w[59] = amd_bytealign_S (w[17], w[18], offset); w[58] = amd_bytealign_S (w[16], w[17], offset); w[57] = amd_bytealign_S (w[15], w[16], offset); w[56] = amd_bytealign_S (w[14], w[15], offset); w[55] = amd_bytealign_S (w[13], w[14], offset); w[54] = amd_bytealign_S (w[12], w[13], offset); w[53] = amd_bytealign_S (w[11], w[12], offset); w[52] = amd_bytealign_S (w[10], w[11], offset); w[51] = amd_bytealign_S (w[ 9], w[10], offset); w[50] = amd_bytealign_S (w[ 8], w[ 9], offset); w[49] = amd_bytealign_S (w[ 7], w[ 8], offset); w[48] = amd_bytealign_S (w[ 6], w[ 7], offset); w[47] = amd_bytealign_S (w[ 5], w[ 6], offset); w[46] = amd_bytealign_S (w[ 4], w[ 5], offset); w[45] = amd_bytealign_S (w[ 3], w[ 4], offset); w[44] = amd_bytealign_S (w[ 2], w[ 3], offset); w[43] = amd_bytealign_S (w[ 1], w[ 2], offset); w[42] = amd_bytealign_S (w[ 0], w[ 1], offset); w[41] = amd_bytealign_S ( 0, w[ 0], offset); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; 
break; case 42: w[63] = amd_bytealign_S (w[20], w[21], offset); w[62] = amd_bytealign_S (w[19], w[20], offset); w[61] = amd_bytealign_S (w[18], w[19], offset); w[60] = amd_bytealign_S (w[17], w[18], offset); w[59] = amd_bytealign_S (w[16], w[17], offset); w[58] = amd_bytealign_S (w[15], w[16], offset); w[57] = amd_bytealign_S (w[14], w[15], offset); w[56] = amd_bytealign_S (w[13], w[14], offset); w[55] = amd_bytealign_S (w[12], w[13], offset); w[54] = amd_bytealign_S (w[11], w[12], offset); w[53] = amd_bytealign_S (w[10], w[11], offset); w[52] = amd_bytealign_S (w[ 9], w[10], offset); w[51] = amd_bytealign_S (w[ 8], w[ 9], offset); w[50] = amd_bytealign_S (w[ 7], w[ 8], offset); w[49] = amd_bytealign_S (w[ 6], w[ 7], offset); w[48] = amd_bytealign_S (w[ 5], w[ 6], offset); w[47] = amd_bytealign_S (w[ 4], w[ 5], offset); w[46] = amd_bytealign_S (w[ 3], w[ 4], offset); w[45] = amd_bytealign_S (w[ 2], w[ 3], offset); w[44] = amd_bytealign_S (w[ 1], w[ 2], offset); w[43] = amd_bytealign_S (w[ 0], w[ 1], offset); w[42] = amd_bytealign_S ( 0, w[ 0], offset); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = amd_bytealign_S (w[19], w[20], offset); w[62] = amd_bytealign_S (w[18], w[19], offset); w[61] = amd_bytealign_S (w[17], w[18], offset); w[60] = amd_bytealign_S (w[16], w[17], offset); w[59] = amd_bytealign_S (w[15], w[16], offset); w[58] = amd_bytealign_S (w[14], w[15], offset); w[57] = amd_bytealign_S (w[13], w[14], offset); w[56] = amd_bytealign_S (w[12], w[13], offset); w[55] = amd_bytealign_S (w[11], w[12], offset); w[54] = 
amd_bytealign_S (w[10], w[11], offset); w[53] = amd_bytealign_S (w[ 9], w[10], offset); w[52] = amd_bytealign_S (w[ 8], w[ 9], offset); w[51] = amd_bytealign_S (w[ 7], w[ 8], offset); w[50] = amd_bytealign_S (w[ 6], w[ 7], offset); w[49] = amd_bytealign_S (w[ 5], w[ 6], offset); w[48] = amd_bytealign_S (w[ 4], w[ 5], offset); w[47] = amd_bytealign_S (w[ 3], w[ 4], offset); w[46] = amd_bytealign_S (w[ 2], w[ 3], offset); w[45] = amd_bytealign_S (w[ 1], w[ 2], offset); w[44] = amd_bytealign_S (w[ 0], w[ 1], offset); w[43] = amd_bytealign_S ( 0, w[ 0], offset); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = amd_bytealign_S (w[18], w[19], offset); w[62] = amd_bytealign_S (w[17], w[18], offset); w[61] = amd_bytealign_S (w[16], w[17], offset); w[60] = amd_bytealign_S (w[15], w[16], offset); w[59] = amd_bytealign_S (w[14], w[15], offset); w[58] = amd_bytealign_S (w[13], w[14], offset); w[57] = amd_bytealign_S (w[12], w[13], offset); w[56] = amd_bytealign_S (w[11], w[12], offset); w[55] = amd_bytealign_S (w[10], w[11], offset); w[54] = amd_bytealign_S (w[ 9], w[10], offset); w[53] = amd_bytealign_S (w[ 8], w[ 9], offset); w[52] = amd_bytealign_S (w[ 7], w[ 8], offset); w[51] = amd_bytealign_S (w[ 6], w[ 7], offset); w[50] = amd_bytealign_S (w[ 5], w[ 6], offset); w[49] = amd_bytealign_S (w[ 4], w[ 5], offset); w[48] = amd_bytealign_S (w[ 3], w[ 4], offset); w[47] = amd_bytealign_S (w[ 2], w[ 3], offset); w[46] = amd_bytealign_S (w[ 1], w[ 2], offset); w[45] = amd_bytealign_S (w[ 0], w[ 1], offset); w[44] = amd_bytealign_S ( 0, w[ 
0], offset); w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = amd_bytealign_S (w[17], w[18], offset); w[62] = amd_bytealign_S (w[16], w[17], offset); w[61] = amd_bytealign_S (w[15], w[16], offset); w[60] = amd_bytealign_S (w[14], w[15], offset); w[59] = amd_bytealign_S (w[13], w[14], offset); w[58] = amd_bytealign_S (w[12], w[13], offset); w[57] = amd_bytealign_S (w[11], w[12], offset); w[56] = amd_bytealign_S (w[10], w[11], offset); w[55] = amd_bytealign_S (w[ 9], w[10], offset); w[54] = amd_bytealign_S (w[ 8], w[ 9], offset); w[53] = amd_bytealign_S (w[ 7], w[ 8], offset); w[52] = amd_bytealign_S (w[ 6], w[ 7], offset); w[51] = amd_bytealign_S (w[ 5], w[ 6], offset); w[50] = amd_bytealign_S (w[ 4], w[ 5], offset); w[49] = amd_bytealign_S (w[ 3], w[ 4], offset); w[48] = amd_bytealign_S (w[ 2], w[ 3], offset); w[47] = amd_bytealign_S (w[ 1], w[ 2], offset); w[46] = amd_bytealign_S (w[ 0], w[ 1], offset); w[45] = amd_bytealign_S ( 0, w[ 0], offset); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = amd_bytealign_S (w[16], w[17], offset); w[62] = 
amd_bytealign_S (w[15], w[16], offset); w[61] = amd_bytealign_S (w[14], w[15], offset); w[60] = amd_bytealign_S (w[13], w[14], offset); w[59] = amd_bytealign_S (w[12], w[13], offset); w[58] = amd_bytealign_S (w[11], w[12], offset); w[57] = amd_bytealign_S (w[10], w[11], offset); w[56] = amd_bytealign_S (w[ 9], w[10], offset); w[55] = amd_bytealign_S (w[ 8], w[ 9], offset); w[54] = amd_bytealign_S (w[ 7], w[ 8], offset); w[53] = amd_bytealign_S (w[ 6], w[ 7], offset); w[52] = amd_bytealign_S (w[ 5], w[ 6], offset); w[51] = amd_bytealign_S (w[ 4], w[ 5], offset); w[50] = amd_bytealign_S (w[ 3], w[ 4], offset); w[49] = amd_bytealign_S (w[ 2], w[ 3], offset); w[48] = amd_bytealign_S (w[ 1], w[ 2], offset); w[47] = amd_bytealign_S (w[ 0], w[ 1], offset); w[46] = amd_bytealign_S ( 0, w[ 0], offset); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 47: w[63] = amd_bytealign_S (w[15], w[16], offset); w[62] = amd_bytealign_S (w[14], w[15], offset); w[61] = amd_bytealign_S (w[13], w[14], offset); w[60] = amd_bytealign_S (w[12], w[13], offset); w[59] = amd_bytealign_S (w[11], w[12], offset); w[58] = amd_bytealign_S (w[10], w[11], offset); w[57] = amd_bytealign_S (w[ 9], w[10], offset); w[56] = amd_bytealign_S (w[ 8], w[ 9], offset); w[55] = amd_bytealign_S (w[ 7], w[ 8], offset); w[54] = amd_bytealign_S (w[ 6], w[ 7], offset); w[53] = amd_bytealign_S (w[ 5], w[ 6], offset); w[52] = amd_bytealign_S (w[ 4], w[ 5], offset); w[51] = amd_bytealign_S (w[ 3], w[ 4], offset); w[50] = amd_bytealign_S (w[ 2], w[ 3], offset); 
w[49] = amd_bytealign_S (w[ 1], w[ 2], offset); w[48] = amd_bytealign_S (w[ 0], w[ 1], offset); w[47] = amd_bytealign_S ( 0, w[ 0], offset); w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = amd_bytealign_S (w[14], w[15], offset); w[62] = amd_bytealign_S (w[13], w[14], offset); w[61] = amd_bytealign_S (w[12], w[13], offset); w[60] = amd_bytealign_S (w[11], w[12], offset); w[59] = amd_bytealign_S (w[10], w[11], offset); w[58] = amd_bytealign_S (w[ 9], w[10], offset); w[57] = amd_bytealign_S (w[ 8], w[ 9], offset); w[56] = amd_bytealign_S (w[ 7], w[ 8], offset); w[55] = amd_bytealign_S (w[ 6], w[ 7], offset); w[54] = amd_bytealign_S (w[ 5], w[ 6], offset); w[53] = amd_bytealign_S (w[ 4], w[ 5], offset); w[52] = amd_bytealign_S (w[ 3], w[ 4], offset); w[51] = amd_bytealign_S (w[ 2], w[ 3], offset); w[50] = amd_bytealign_S (w[ 1], w[ 2], offset); w[49] = amd_bytealign_S (w[ 0], w[ 1], offset); w[48] = amd_bytealign_S ( 0, w[ 0], offset); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = 
amd_bytealign_S (w[13], w[14], offset); w[62] = amd_bytealign_S (w[12], w[13], offset); w[61] = amd_bytealign_S (w[11], w[12], offset); w[60] = amd_bytealign_S (w[10], w[11], offset); w[59] = amd_bytealign_S (w[ 9], w[10], offset); w[58] = amd_bytealign_S (w[ 8], w[ 9], offset); w[57] = amd_bytealign_S (w[ 7], w[ 8], offset); w[56] = amd_bytealign_S (w[ 6], w[ 7], offset); w[55] = amd_bytealign_S (w[ 5], w[ 6], offset); w[54] = amd_bytealign_S (w[ 4], w[ 5], offset); w[53] = amd_bytealign_S (w[ 3], w[ 4], offset); w[52] = amd_bytealign_S (w[ 2], w[ 3], offset); w[51] = amd_bytealign_S (w[ 1], w[ 2], offset); w[50] = amd_bytealign_S (w[ 0], w[ 1], offset); w[49] = amd_bytealign_S ( 0, w[ 0], offset); w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = amd_bytealign_S (w[12], w[13], offset); w[62] = amd_bytealign_S (w[11], w[12], offset); w[61] = amd_bytealign_S (w[10], w[11], offset); w[60] = amd_bytealign_S (w[ 9], w[10], offset); w[59] = amd_bytealign_S (w[ 8], w[ 9], offset); w[58] = amd_bytealign_S (w[ 7], w[ 8], offset); w[57] = amd_bytealign_S (w[ 6], w[ 7], offset); w[56] = amd_bytealign_S (w[ 5], w[ 6], offset); w[55] = amd_bytealign_S (w[ 4], w[ 5], offset); w[54] = amd_bytealign_S (w[ 3], w[ 4], offset); w[53] = amd_bytealign_S (w[ 2], w[ 3], offset); w[52] = amd_bytealign_S (w[ 1], w[ 2], offset); w[51] = amd_bytealign_S (w[ 0], w[ 1], offset); w[50] = amd_bytealign_S ( 0, w[ 0], offset); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; 
w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = amd_bytealign_S (w[11], w[12], offset); w[62] = amd_bytealign_S (w[10], w[11], offset); w[61] = amd_bytealign_S (w[ 9], w[10], offset); w[60] = amd_bytealign_S (w[ 8], w[ 9], offset); w[59] = amd_bytealign_S (w[ 7], w[ 8], offset); w[58] = amd_bytealign_S (w[ 6], w[ 7], offset); w[57] = amd_bytealign_S (w[ 5], w[ 6], offset); w[56] = amd_bytealign_S (w[ 4], w[ 5], offset); w[55] = amd_bytealign_S (w[ 3], w[ 4], offset); w[54] = amd_bytealign_S (w[ 2], w[ 3], offset); w[53] = amd_bytealign_S (w[ 1], w[ 2], offset); w[52] = amd_bytealign_S (w[ 0], w[ 1], offset); w[51] = amd_bytealign_S ( 0, w[ 0], offset); w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = amd_bytealign_S (w[10], w[11], offset); w[62] = amd_bytealign_S (w[ 9], w[10], offset); w[61] = amd_bytealign_S (w[ 8], w[ 9], offset); w[60] = amd_bytealign_S (w[ 7], w[ 8], offset); w[59] = amd_bytealign_S (w[ 6], w[ 7], offset); w[58] = amd_bytealign_S (w[ 5], w[ 6], offset); w[57] = 
amd_bytealign_S (w[ 4], w[ 5], offset); w[56] = amd_bytealign_S (w[ 3], w[ 4], offset); w[55] = amd_bytealign_S (w[ 2], w[ 3], offset); w[54] = amd_bytealign_S (w[ 1], w[ 2], offset); w[53] = amd_bytealign_S (w[ 0], w[ 1], offset); w[52] = amd_bytealign_S ( 0, w[ 0], offset); w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = amd_bytealign_S (w[ 9], w[10], offset); w[62] = amd_bytealign_S (w[ 8], w[ 9], offset); w[61] = amd_bytealign_S (w[ 7], w[ 8], offset); w[60] = amd_bytealign_S (w[ 6], w[ 7], offset); w[59] = amd_bytealign_S (w[ 5], w[ 6], offset); w[58] = amd_bytealign_S (w[ 4], w[ 5], offset); w[57] = amd_bytealign_S (w[ 3], w[ 4], offset); w[56] = amd_bytealign_S (w[ 2], w[ 3], offset); w[55] = amd_bytealign_S (w[ 1], w[ 2], offset); w[54] = amd_bytealign_S (w[ 0], w[ 1], offset); w[53] = amd_bytealign_S ( 0, w[ 0], offset); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = 
amd_bytealign_S (w[ 8], w[ 9], offset); w[62] = amd_bytealign_S (w[ 7], w[ 8], offset); w[61] = amd_bytealign_S (w[ 6], w[ 7], offset); w[60] = amd_bytealign_S (w[ 5], w[ 6], offset); w[59] = amd_bytealign_S (w[ 4], w[ 5], offset); w[58] = amd_bytealign_S (w[ 3], w[ 4], offset); w[57] = amd_bytealign_S (w[ 2], w[ 3], offset); w[56] = amd_bytealign_S (w[ 1], w[ 2], offset); w[55] = amd_bytealign_S (w[ 0], w[ 1], offset); w[54] = amd_bytealign_S ( 0, w[ 0], offset); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = amd_bytealign_S (w[ 7], w[ 8], offset); w[62] = amd_bytealign_S (w[ 6], w[ 7], offset); w[61] = amd_bytealign_S (w[ 5], w[ 6], offset); w[60] = amd_bytealign_S (w[ 4], w[ 5], offset); w[59] = amd_bytealign_S (w[ 3], w[ 4], offset); w[58] = amd_bytealign_S (w[ 2], w[ 3], offset); w[57] = amd_bytealign_S (w[ 1], w[ 2], offset); w[56] = amd_bytealign_S (w[ 0], w[ 1], offset); w[55] = amd_bytealign_S ( 0, w[ 0], offset); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 
0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 56: w[63] = amd_bytealign_S (w[ 6], w[ 7], offset); w[62] = amd_bytealign_S (w[ 5], w[ 6], offset); w[61] = amd_bytealign_S (w[ 4], w[ 5], offset); w[60] = amd_bytealign_S (w[ 3], w[ 4], offset); w[59] = amd_bytealign_S (w[ 2], w[ 3], offset); w[58] = amd_bytealign_S (w[ 1], w[ 2], offset); w[57] = amd_bytealign_S (w[ 0], w[ 1], offset); w[56] = amd_bytealign_S ( 0, w[ 0], offset); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = amd_bytealign_S (w[ 5], w[ 6], offset); w[62] = amd_bytealign_S (w[ 4], w[ 5], offset); w[61] = amd_bytealign_S (w[ 3], w[ 4], offset); w[60] = amd_bytealign_S (w[ 2], w[ 3], offset); w[59] = amd_bytealign_S (w[ 1], w[ 2], offset); w[58] = amd_bytealign_S (w[ 0], w[ 1], offset); w[57] = amd_bytealign_S ( 0, w[ 0], offset); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 
0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = amd_bytealign_S (w[ 4], w[ 5], offset); w[62] = amd_bytealign_S (w[ 3], w[ 4], offset); w[61] = amd_bytealign_S (w[ 2], w[ 3], offset); w[60] = amd_bytealign_S (w[ 1], w[ 2], offset); w[59] = amd_bytealign_S (w[ 0], w[ 1], offset); w[58] = amd_bytealign_S ( 0, w[ 0], offset); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = amd_bytealign_S (w[ 3], w[ 4], offset); w[62] = amd_bytealign_S (w[ 2], w[ 3], offset); w[61] = amd_bytealign_S (w[ 1], w[ 2], offset); w[60] = amd_bytealign_S (w[ 0], w[ 1], offset); w[59] = amd_bytealign_S ( 0, w[ 0], offset); w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = amd_bytealign_S (w[ 2], w[ 3], 
offset); w[62] = amd_bytealign_S (w[ 1], w[ 2], offset); w[61] = amd_bytealign_S (w[ 0], w[ 1], offset); w[60] = amd_bytealign_S ( 0, w[ 0], offset); w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = amd_bytealign_S (w[ 1], w[ 2], offset); w[62] = amd_bytealign_S (w[ 0], w[ 1], offset); w[61] = amd_bytealign_S ( 0, w[ 0], offset); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = amd_bytealign_S (w[ 0], w[ 1], offset); w[62] = amd_bytealign_S ( 0, w[ 0], offset); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; 
w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 63: w[63] = amd_bytealign_S ( 0, w[ 0], offset); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #endif #if (defined IS_AMD && AMD_GCN >= 3) || defined IS_NV #if defined IS_NV const int selector = (0x76543210 >> ((offset & 3) * 4)) & 0xffff; #endif #if defined IS_AMD const int selector = 0x0706050403020100 >> ((offset & 3) * 8); #endif switch (offset_switch) { case 0: w[63] = __byte_perm_S (w[63], w[62], selector); w[62] = __byte_perm_S (w[62], w[61], selector); w[61] = __byte_perm_S (w[61], w[60], selector); w[60] = __byte_perm_S (w[60], w[59], selector); w[59] = __byte_perm_S (w[59], w[58], selector); w[58] = __byte_perm_S (w[58], w[57], selector); w[57] = __byte_perm_S (w[57], w[56], selector); w[56] = __byte_perm_S (w[56], w[55], selector); w[55] = __byte_perm_S (w[55], w[54], selector); w[54] = __byte_perm_S (w[54], w[53], selector); w[53] = __byte_perm_S (w[53], w[52], selector); w[52] 
= __byte_perm_S (w[52], w[51], selector); w[51] = __byte_perm_S (w[51], w[50], selector); w[50] = __byte_perm_S (w[50], w[49], selector); w[49] = __byte_perm_S (w[49], w[48], selector); w[48] = __byte_perm_S (w[48], w[47], selector); w[47] = __byte_perm_S (w[47], w[46], selector); w[46] = __byte_perm_S (w[46], w[45], selector); w[45] = __byte_perm_S (w[45], w[44], selector); w[44] = __byte_perm_S (w[44], w[43], selector); w[43] = __byte_perm_S (w[43], w[42], selector); w[42] = __byte_perm_S (w[42], w[41], selector); w[41] = __byte_perm_S (w[41], w[40], selector); w[40] = __byte_perm_S (w[40], w[39], selector); w[39] = __byte_perm_S (w[39], w[38], selector); w[38] = __byte_perm_S (w[38], w[37], selector); w[37] = __byte_perm_S (w[37], w[36], selector); w[36] = __byte_perm_S (w[36], w[35], selector); w[35] = __byte_perm_S (w[35], w[34], selector); w[34] = __byte_perm_S (w[34], w[33], selector); w[33] = __byte_perm_S (w[33], w[32], selector); w[32] = __byte_perm_S (w[32], w[31], selector); w[31] = __byte_perm_S (w[31], w[30], selector); w[30] = __byte_perm_S (w[30], w[29], selector); w[29] = __byte_perm_S (w[29], w[28], selector); w[28] = __byte_perm_S (w[28], w[27], selector); w[27] = __byte_perm_S (w[27], w[26], selector); w[26] = __byte_perm_S (w[26], w[25], selector); w[25] = __byte_perm_S (w[25], w[24], selector); w[24] = __byte_perm_S (w[24], w[23], selector); w[23] = __byte_perm_S (w[23], w[22], selector); w[22] = __byte_perm_S (w[22], w[21], selector); w[21] = __byte_perm_S (w[21], w[20], selector); w[20] = __byte_perm_S (w[20], w[19], selector); w[19] = __byte_perm_S (w[19], w[18], selector); w[18] = __byte_perm_S (w[18], w[17], selector); w[17] = __byte_perm_S (w[17], w[16], selector); w[16] = __byte_perm_S (w[16], w[15], selector); w[15] = __byte_perm_S (w[15], w[14], selector); w[14] = __byte_perm_S (w[14], w[13], selector); w[13] = __byte_perm_S (w[13], w[12], selector); w[12] = __byte_perm_S (w[12], w[11], selector); w[11] = __byte_perm_S (w[11], w[10], 
selector); w[10] = __byte_perm_S (w[10], w[ 9], selector); w[ 9] = __byte_perm_S (w[ 9], w[ 8], selector); w[ 8] = __byte_perm_S (w[ 8], w[ 7], selector); w[ 7] = __byte_perm_S (w[ 7], w[ 6], selector); w[ 6] = __byte_perm_S (w[ 6], w[ 5], selector); w[ 5] = __byte_perm_S (w[ 5], w[ 4], selector); w[ 4] = __byte_perm_S (w[ 4], w[ 3], selector); w[ 3] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 2] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 1] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 0] = __byte_perm_S (w[ 0], 0, selector); break; case 1: w[63] = __byte_perm_S (w[62], w[61], selector); w[62] = __byte_perm_S (w[61], w[60], selector); w[61] = __byte_perm_S (w[60], w[59], selector); w[60] = __byte_perm_S (w[59], w[58], selector); w[59] = __byte_perm_S (w[58], w[57], selector); w[58] = __byte_perm_S (w[57], w[56], selector); w[57] = __byte_perm_S (w[56], w[55], selector); w[56] = __byte_perm_S (w[55], w[54], selector); w[55] = __byte_perm_S (w[54], w[53], selector); w[54] = __byte_perm_S (w[53], w[52], selector); w[53] = __byte_perm_S (w[52], w[51], selector); w[52] = __byte_perm_S (w[51], w[50], selector); w[51] = __byte_perm_S (w[50], w[49], selector); w[50] = __byte_perm_S (w[49], w[48], selector); w[49] = __byte_perm_S (w[48], w[47], selector); w[48] = __byte_perm_S (w[47], w[46], selector); w[47] = __byte_perm_S (w[46], w[45], selector); w[46] = __byte_perm_S (w[45], w[44], selector); w[45] = __byte_perm_S (w[44], w[43], selector); w[44] = __byte_perm_S (w[43], w[42], selector); w[43] = __byte_perm_S (w[42], w[41], selector); w[42] = __byte_perm_S (w[41], w[40], selector); w[41] = __byte_perm_S (w[40], w[39], selector); w[40] = __byte_perm_S (w[39], w[38], selector); w[39] = __byte_perm_S (w[38], w[37], selector); w[38] = __byte_perm_S (w[37], w[36], selector); w[37] = __byte_perm_S (w[36], w[35], selector); w[36] = __byte_perm_S (w[35], w[34], selector); w[35] = __byte_perm_S (w[34], w[33], selector); w[34] = __byte_perm_S (w[33], w[32], selector); w[33] = 
__byte_perm_S (w[32], w[31], selector); w[32] = __byte_perm_S (w[31], w[30], selector); w[31] = __byte_perm_S (w[30], w[29], selector); w[30] = __byte_perm_S (w[29], w[28], selector); w[29] = __byte_perm_S (w[28], w[27], selector); w[28] = __byte_perm_S (w[27], w[26], selector); w[27] = __byte_perm_S (w[26], w[25], selector); w[26] = __byte_perm_S (w[25], w[24], selector); w[25] = __byte_perm_S (w[24], w[23], selector); w[24] = __byte_perm_S (w[23], w[22], selector); w[23] = __byte_perm_S (w[22], w[21], selector); w[22] = __byte_perm_S (w[21], w[20], selector); w[21] = __byte_perm_S (w[20], w[19], selector); w[20] = __byte_perm_S (w[19], w[18], selector); w[19] = __byte_perm_S (w[18], w[17], selector); w[18] = __byte_perm_S (w[17], w[16], selector); w[17] = __byte_perm_S (w[16], w[15], selector); w[16] = __byte_perm_S (w[15], w[14], selector); w[15] = __byte_perm_S (w[14], w[13], selector); w[14] = __byte_perm_S (w[13], w[12], selector); w[13] = __byte_perm_S (w[12], w[11], selector); w[12] = __byte_perm_S (w[11], w[10], selector); w[11] = __byte_perm_S (w[10], w[ 9], selector); w[10] = __byte_perm_S (w[ 9], w[ 8], selector); w[ 9] = __byte_perm_S (w[ 8], w[ 7], selector); w[ 8] = __byte_perm_S (w[ 7], w[ 6], selector); w[ 7] = __byte_perm_S (w[ 6], w[ 5], selector); w[ 6] = __byte_perm_S (w[ 5], w[ 4], selector); w[ 5] = __byte_perm_S (w[ 4], w[ 3], selector); w[ 4] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 3] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 2] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 1] = __byte_perm_S (w[ 0], 0, selector); w[ 0] = 0; break; case 2: w[63] = __byte_perm_S (w[61], w[60], selector); w[62] = __byte_perm_S (w[60], w[59], selector); w[61] = __byte_perm_S (w[59], w[58], selector); w[60] = __byte_perm_S (w[58], w[57], selector); w[59] = __byte_perm_S (w[57], w[56], selector); w[58] = __byte_perm_S (w[56], w[55], selector); w[57] = __byte_perm_S (w[55], w[54], selector); w[56] = __byte_perm_S (w[54], w[53], selector); w[55] = 
__byte_perm_S (w[53], w[52], selector); w[54] = __byte_perm_S (w[52], w[51], selector); w[53] = __byte_perm_S (w[51], w[50], selector); w[52] = __byte_perm_S (w[50], w[49], selector); w[51] = __byte_perm_S (w[49], w[48], selector); w[50] = __byte_perm_S (w[48], w[47], selector); w[49] = __byte_perm_S (w[47], w[46], selector); w[48] = __byte_perm_S (w[46], w[45], selector); w[47] = __byte_perm_S (w[45], w[44], selector); w[46] = __byte_perm_S (w[44], w[43], selector); w[45] = __byte_perm_S (w[43], w[42], selector); w[44] = __byte_perm_S (w[42], w[41], selector); w[43] = __byte_perm_S (w[41], w[40], selector); w[42] = __byte_perm_S (w[40], w[39], selector); w[41] = __byte_perm_S (w[39], w[38], selector); w[40] = __byte_perm_S (w[38], w[37], selector); w[39] = __byte_perm_S (w[37], w[36], selector); w[38] = __byte_perm_S (w[36], w[35], selector); w[37] = __byte_perm_S (w[35], w[34], selector); w[36] = __byte_perm_S (w[34], w[33], selector); w[35] = __byte_perm_S (w[33], w[32], selector); w[34] = __byte_perm_S (w[32], w[31], selector); w[33] = __byte_perm_S (w[31], w[30], selector); w[32] = __byte_perm_S (w[30], w[29], selector); w[31] = __byte_perm_S (w[29], w[28], selector); w[30] = __byte_perm_S (w[28], w[27], selector); w[29] = __byte_perm_S (w[27], w[26], selector); w[28] = __byte_perm_S (w[26], w[25], selector); w[27] = __byte_perm_S (w[25], w[24], selector); w[26] = __byte_perm_S (w[24], w[23], selector); w[25] = __byte_perm_S (w[23], w[22], selector); w[24] = __byte_perm_S (w[22], w[21], selector); w[23] = __byte_perm_S (w[21], w[20], selector); w[22] = __byte_perm_S (w[20], w[19], selector); w[21] = __byte_perm_S (w[19], w[18], selector); w[20] = __byte_perm_S (w[18], w[17], selector); w[19] = __byte_perm_S (w[17], w[16], selector); w[18] = __byte_perm_S (w[16], w[15], selector); w[17] = __byte_perm_S (w[15], w[14], selector); w[16] = __byte_perm_S (w[14], w[13], selector); w[15] = __byte_perm_S (w[13], w[12], selector); w[14] = __byte_perm_S (w[12], w[11], 
selector); w[13] = __byte_perm_S (w[11], w[10], selector); w[12] = __byte_perm_S (w[10], w[ 9], selector); w[11] = __byte_perm_S (w[ 9], w[ 8], selector); w[10] = __byte_perm_S (w[ 8], w[ 7], selector); w[ 9] = __byte_perm_S (w[ 7], w[ 6], selector); w[ 8] = __byte_perm_S (w[ 6], w[ 5], selector); w[ 7] = __byte_perm_S (w[ 5], w[ 4], selector); w[ 6] = __byte_perm_S (w[ 4], w[ 3], selector); w[ 5] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 4] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 3] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 2] = __byte_perm_S (w[ 0], 0, selector); w[ 1] = 0; w[ 0] = 0; break; case 3: w[63] = __byte_perm_S (w[60], w[59], selector); w[62] = __byte_perm_S (w[59], w[58], selector); w[61] = __byte_perm_S (w[58], w[57], selector); w[60] = __byte_perm_S (w[57], w[56], selector); w[59] = __byte_perm_S (w[56], w[55], selector); w[58] = __byte_perm_S (w[55], w[54], selector); w[57] = __byte_perm_S (w[54], w[53], selector); w[56] = __byte_perm_S (w[53], w[52], selector); w[55] = __byte_perm_S (w[52], w[51], selector); w[54] = __byte_perm_S (w[51], w[50], selector); w[53] = __byte_perm_S (w[50], w[49], selector); w[52] = __byte_perm_S (w[49], w[48], selector); w[51] = __byte_perm_S (w[48], w[47], selector); w[50] = __byte_perm_S (w[47], w[46], selector); w[49] = __byte_perm_S (w[46], w[45], selector); w[48] = __byte_perm_S (w[45], w[44], selector); w[47] = __byte_perm_S (w[44], w[43], selector); w[46] = __byte_perm_S (w[43], w[42], selector); w[45] = __byte_perm_S (w[42], w[41], selector); w[44] = __byte_perm_S (w[41], w[40], selector); w[43] = __byte_perm_S (w[40], w[39], selector); w[42] = __byte_perm_S (w[39], w[38], selector); w[41] = __byte_perm_S (w[38], w[37], selector); w[40] = __byte_perm_S (w[37], w[36], selector); w[39] = __byte_perm_S (w[36], w[35], selector); w[38] = __byte_perm_S (w[35], w[34], selector); w[37] = __byte_perm_S (w[34], w[33], selector); w[36] = __byte_perm_S (w[33], w[32], selector); w[35] = __byte_perm_S (w[32], 
w[31], selector); w[34] = __byte_perm_S (w[31], w[30], selector); w[33] = __byte_perm_S (w[30], w[29], selector); w[32] = __byte_perm_S (w[29], w[28], selector); w[31] = __byte_perm_S (w[28], w[27], selector); w[30] = __byte_perm_S (w[27], w[26], selector); w[29] = __byte_perm_S (w[26], w[25], selector); w[28] = __byte_perm_S (w[25], w[24], selector); w[27] = __byte_perm_S (w[24], w[23], selector); w[26] = __byte_perm_S (w[23], w[22], selector); w[25] = __byte_perm_S (w[22], w[21], selector); w[24] = __byte_perm_S (w[21], w[20], selector); w[23] = __byte_perm_S (w[20], w[19], selector); w[22] = __byte_perm_S (w[19], w[18], selector); w[21] = __byte_perm_S (w[18], w[17], selector); w[20] = __byte_perm_S (w[17], w[16], selector); w[19] = __byte_perm_S (w[16], w[15], selector); w[18] = __byte_perm_S (w[15], w[14], selector); w[17] = __byte_perm_S (w[14], w[13], selector); w[16] = __byte_perm_S (w[13], w[12], selector); w[15] = __byte_perm_S (w[12], w[11], selector); w[14] = __byte_perm_S (w[11], w[10], selector); w[13] = __byte_perm_S (w[10], w[ 9], selector); w[12] = __byte_perm_S (w[ 9], w[ 8], selector); w[11] = __byte_perm_S (w[ 8], w[ 7], selector); w[10] = __byte_perm_S (w[ 7], w[ 6], selector); w[ 9] = __byte_perm_S (w[ 6], w[ 5], selector); w[ 8] = __byte_perm_S (w[ 5], w[ 4], selector); w[ 7] = __byte_perm_S (w[ 4], w[ 3], selector); w[ 6] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 5] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 4] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 3] = __byte_perm_S (w[ 0], 0, selector); w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 4: w[63] = __byte_perm_S (w[59], w[58], selector); w[62] = __byte_perm_S (w[58], w[57], selector); w[61] = __byte_perm_S (w[57], w[56], selector); w[60] = __byte_perm_S (w[56], w[55], selector); w[59] = __byte_perm_S (w[55], w[54], selector); w[58] = __byte_perm_S (w[54], w[53], selector); w[57] = __byte_perm_S (w[53], w[52], selector); w[56] = __byte_perm_S (w[52], w[51], selector); w[55] = 
__byte_perm_S (w[51], w[50], selector); w[54] = __byte_perm_S (w[50], w[49], selector); w[53] = __byte_perm_S (w[49], w[48], selector); w[52] = __byte_perm_S (w[48], w[47], selector); w[51] = __byte_perm_S (w[47], w[46], selector); w[50] = __byte_perm_S (w[46], w[45], selector); w[49] = __byte_perm_S (w[45], w[44], selector); w[48] = __byte_perm_S (w[44], w[43], selector); w[47] = __byte_perm_S (w[43], w[42], selector); w[46] = __byte_perm_S (w[42], w[41], selector); w[45] = __byte_perm_S (w[41], w[40], selector); w[44] = __byte_perm_S (w[40], w[39], selector); w[43] = __byte_perm_S (w[39], w[38], selector); w[42] = __byte_perm_S (w[38], w[37], selector); w[41] = __byte_perm_S (w[37], w[36], selector); w[40] = __byte_perm_S (w[36], w[35], selector); w[39] = __byte_perm_S (w[35], w[34], selector); w[38] = __byte_perm_S (w[34], w[33], selector); w[37] = __byte_perm_S (w[33], w[32], selector); w[36] = __byte_perm_S (w[32], w[31], selector); w[35] = __byte_perm_S (w[31], w[30], selector); w[34] = __byte_perm_S (w[30], w[29], selector); w[33] = __byte_perm_S (w[29], w[28], selector); w[32] = __byte_perm_S (w[28], w[27], selector); w[31] = __byte_perm_S (w[27], w[26], selector); w[30] = __byte_perm_S (w[26], w[25], selector); w[29] = __byte_perm_S (w[25], w[24], selector); w[28] = __byte_perm_S (w[24], w[23], selector); w[27] = __byte_perm_S (w[23], w[22], selector); w[26] = __byte_perm_S (w[22], w[21], selector); w[25] = __byte_perm_S (w[21], w[20], selector); w[24] = __byte_perm_S (w[20], w[19], selector); w[23] = __byte_perm_S (w[19], w[18], selector); w[22] = __byte_perm_S (w[18], w[17], selector); w[21] = __byte_perm_S (w[17], w[16], selector); w[20] = __byte_perm_S (w[16], w[15], selector); w[19] = __byte_perm_S (w[15], w[14], selector); w[18] = __byte_perm_S (w[14], w[13], selector); w[17] = __byte_perm_S (w[13], w[12], selector); w[16] = __byte_perm_S (w[12], w[11], selector); w[15] = __byte_perm_S (w[11], w[10], selector); w[14] = __byte_perm_S (w[10], w[ 9], 
selector); w[13] = __byte_perm_S (w[ 9], w[ 8], selector); w[12] = __byte_perm_S (w[ 8], w[ 7], selector); w[11] = __byte_perm_S (w[ 7], w[ 6], selector); w[10] = __byte_perm_S (w[ 6], w[ 5], selector); w[ 9] = __byte_perm_S (w[ 5], w[ 4], selector); w[ 8] = __byte_perm_S (w[ 4], w[ 3], selector); w[ 7] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 6] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 5] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 4] = __byte_perm_S (w[ 0], 0, selector); w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 5: w[63] = __byte_perm_S (w[58], w[57], selector); w[62] = __byte_perm_S (w[57], w[56], selector); w[61] = __byte_perm_S (w[56], w[55], selector); w[60] = __byte_perm_S (w[55], w[54], selector); w[59] = __byte_perm_S (w[54], w[53], selector); w[58] = __byte_perm_S (w[53], w[52], selector); w[57] = __byte_perm_S (w[52], w[51], selector); w[56] = __byte_perm_S (w[51], w[50], selector); w[55] = __byte_perm_S (w[50], w[49], selector); w[54] = __byte_perm_S (w[49], w[48], selector); w[53] = __byte_perm_S (w[48], w[47], selector); w[52] = __byte_perm_S (w[47], w[46], selector); w[51] = __byte_perm_S (w[46], w[45], selector); w[50] = __byte_perm_S (w[45], w[44], selector); w[49] = __byte_perm_S (w[44], w[43], selector); w[48] = __byte_perm_S (w[43], w[42], selector); w[47] = __byte_perm_S (w[42], w[41], selector); w[46] = __byte_perm_S (w[41], w[40], selector); w[45] = __byte_perm_S (w[40], w[39], selector); w[44] = __byte_perm_S (w[39], w[38], selector); w[43] = __byte_perm_S (w[38], w[37], selector); w[42] = __byte_perm_S (w[37], w[36], selector); w[41] = __byte_perm_S (w[36], w[35], selector); w[40] = __byte_perm_S (w[35], w[34], selector); w[39] = __byte_perm_S (w[34], w[33], selector); w[38] = __byte_perm_S (w[33], w[32], selector); w[37] = __byte_perm_S (w[32], w[31], selector); w[36] = __byte_perm_S (w[31], w[30], selector); w[35] = __byte_perm_S (w[30], w[29], selector); w[34] = __byte_perm_S (w[29], w[28], selector); w[33] = 
__byte_perm_S (w[28], w[27], selector); w[32] = __byte_perm_S (w[27], w[26], selector); w[31] = __byte_perm_S (w[26], w[25], selector); w[30] = __byte_perm_S (w[25], w[24], selector); w[29] = __byte_perm_S (w[24], w[23], selector); w[28] = __byte_perm_S (w[23], w[22], selector); w[27] = __byte_perm_S (w[22], w[21], selector); w[26] = __byte_perm_S (w[21], w[20], selector); w[25] = __byte_perm_S (w[20], w[19], selector); w[24] = __byte_perm_S (w[19], w[18], selector); w[23] = __byte_perm_S (w[18], w[17], selector); w[22] = __byte_perm_S (w[17], w[16], selector); w[21] = __byte_perm_S (w[16], w[15], selector); w[20] = __byte_perm_S (w[15], w[14], selector); w[19] = __byte_perm_S (w[14], w[13], selector); w[18] = __byte_perm_S (w[13], w[12], selector); w[17] = __byte_perm_S (w[12], w[11], selector); w[16] = __byte_perm_S (w[11], w[10], selector); w[15] = __byte_perm_S (w[10], w[ 9], selector); w[14] = __byte_perm_S (w[ 9], w[ 8], selector); w[13] = __byte_perm_S (w[ 8], w[ 7], selector); w[12] = __byte_perm_S (w[ 7], w[ 6], selector); w[11] = __byte_perm_S (w[ 6], w[ 5], selector); w[10] = __byte_perm_S (w[ 5], w[ 4], selector); w[ 9] = __byte_perm_S (w[ 4], w[ 3], selector); w[ 8] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 7] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 6] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 5] = __byte_perm_S (w[ 0], 0, selector); w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 6: w[63] = __byte_perm_S (w[57], w[56], selector); w[62] = __byte_perm_S (w[56], w[55], selector); w[61] = __byte_perm_S (w[55], w[54], selector); w[60] = __byte_perm_S (w[54], w[53], selector); w[59] = __byte_perm_S (w[53], w[52], selector); w[58] = __byte_perm_S (w[52], w[51], selector); w[57] = __byte_perm_S (w[51], w[50], selector); w[56] = __byte_perm_S (w[50], w[49], selector); w[55] = __byte_perm_S (w[49], w[48], selector); w[54] = __byte_perm_S (w[48], w[47], selector); w[53] = __byte_perm_S (w[47], w[46], selector); w[52] = __byte_perm_S 
(w[46], w[45], selector); w[51] = __byte_perm_S (w[45], w[44], selector); w[50] = __byte_perm_S (w[44], w[43], selector); w[49] = __byte_perm_S (w[43], w[42], selector); w[48] = __byte_perm_S (w[42], w[41], selector); w[47] = __byte_perm_S (w[41], w[40], selector); w[46] = __byte_perm_S (w[40], w[39], selector); w[45] = __byte_perm_S (w[39], w[38], selector); w[44] = __byte_perm_S (w[38], w[37], selector); w[43] = __byte_perm_S (w[37], w[36], selector); w[42] = __byte_perm_S (w[36], w[35], selector); w[41] = __byte_perm_S (w[35], w[34], selector); w[40] = __byte_perm_S (w[34], w[33], selector); w[39] = __byte_perm_S (w[33], w[32], selector); w[38] = __byte_perm_S (w[32], w[31], selector); w[37] = __byte_perm_S (w[31], w[30], selector); w[36] = __byte_perm_S (w[30], w[29], selector); w[35] = __byte_perm_S (w[29], w[28], selector); w[34] = __byte_perm_S (w[28], w[27], selector); w[33] = __byte_perm_S (w[27], w[26], selector); w[32] = __byte_perm_S (w[26], w[25], selector); w[31] = __byte_perm_S (w[25], w[24], selector); w[30] = __byte_perm_S (w[24], w[23], selector); w[29] = __byte_perm_S (w[23], w[22], selector); w[28] = __byte_perm_S (w[22], w[21], selector); w[27] = __byte_perm_S (w[21], w[20], selector); w[26] = __byte_perm_S (w[20], w[19], selector); w[25] = __byte_perm_S (w[19], w[18], selector); w[24] = __byte_perm_S (w[18], w[17], selector); w[23] = __byte_perm_S (w[17], w[16], selector); w[22] = __byte_perm_S (w[16], w[15], selector); w[21] = __byte_perm_S (w[15], w[14], selector); w[20] = __byte_perm_S (w[14], w[13], selector); w[19] = __byte_perm_S (w[13], w[12], selector); w[18] = __byte_perm_S (w[12], w[11], selector); w[17] = __byte_perm_S (w[11], w[10], selector); w[16] = __byte_perm_S (w[10], w[ 9], selector); w[15] = __byte_perm_S (w[ 9], w[ 8], selector); w[14] = __byte_perm_S (w[ 8], w[ 7], selector); w[13] = __byte_perm_S (w[ 7], w[ 6], selector); w[12] = __byte_perm_S (w[ 6], w[ 5], selector); w[11] = __byte_perm_S (w[ 5], w[ 4], selector); w[10] 
= __byte_perm_S (w[ 4], w[ 3], selector); w[ 9] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 8] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 7] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 6] = __byte_perm_S (w[ 0], 0, selector); w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 7: w[63] = __byte_perm_S (w[56], w[55], selector); w[62] = __byte_perm_S (w[55], w[54], selector); w[61] = __byte_perm_S (w[54], w[53], selector); w[60] = __byte_perm_S (w[53], w[52], selector); w[59] = __byte_perm_S (w[52], w[51], selector); w[58] = __byte_perm_S (w[51], w[50], selector); w[57] = __byte_perm_S (w[50], w[49], selector); w[56] = __byte_perm_S (w[49], w[48], selector); w[55] = __byte_perm_S (w[48], w[47], selector); w[54] = __byte_perm_S (w[47], w[46], selector); w[53] = __byte_perm_S (w[46], w[45], selector); w[52] = __byte_perm_S (w[45], w[44], selector); w[51] = __byte_perm_S (w[44], w[43], selector); w[50] = __byte_perm_S (w[43], w[42], selector); w[49] = __byte_perm_S (w[42], w[41], selector); w[48] = __byte_perm_S (w[41], w[40], selector); w[47] = __byte_perm_S (w[40], w[39], selector); w[46] = __byte_perm_S (w[39], w[38], selector); w[45] = __byte_perm_S (w[38], w[37], selector); w[44] = __byte_perm_S (w[37], w[36], selector); w[43] = __byte_perm_S (w[36], w[35], selector); w[42] = __byte_perm_S (w[35], w[34], selector); w[41] = __byte_perm_S (w[34], w[33], selector); w[40] = __byte_perm_S (w[33], w[32], selector); w[39] = __byte_perm_S (w[32], w[31], selector); w[38] = __byte_perm_S (w[31], w[30], selector); w[37] = __byte_perm_S (w[30], w[29], selector); w[36] = __byte_perm_S (w[29], w[28], selector); w[35] = __byte_perm_S (w[28], w[27], selector); w[34] = __byte_perm_S (w[27], w[26], selector); w[33] = __byte_perm_S (w[26], w[25], selector); w[32] = __byte_perm_S (w[25], w[24], selector); w[31] = __byte_perm_S (w[24], w[23], selector); w[30] = __byte_perm_S (w[23], w[22], selector); w[29] = __byte_perm_S (w[22], w[21], selector); w[28] = 
__byte_perm_S (w[21], w[20], selector); w[27] = __byte_perm_S (w[20], w[19], selector); w[26] = __byte_perm_S (w[19], w[18], selector); w[25] = __byte_perm_S (w[18], w[17], selector); w[24] = __byte_perm_S (w[17], w[16], selector); w[23] = __byte_perm_S (w[16], w[15], selector); w[22] = __byte_perm_S (w[15], w[14], selector); w[21] = __byte_perm_S (w[14], w[13], selector); w[20] = __byte_perm_S (w[13], w[12], selector); w[19] = __byte_perm_S (w[12], w[11], selector); w[18] = __byte_perm_S (w[11], w[10], selector); w[17] = __byte_perm_S (w[10], w[ 9], selector); w[16] = __byte_perm_S (w[ 9], w[ 8], selector); w[15] = __byte_perm_S (w[ 8], w[ 7], selector); w[14] = __byte_perm_S (w[ 7], w[ 6], selector); w[13] = __byte_perm_S (w[ 6], w[ 5], selector); w[12] = __byte_perm_S (w[ 5], w[ 4], selector); w[11] = __byte_perm_S (w[ 4], w[ 3], selector); w[10] = __byte_perm_S (w[ 3], w[ 2], selector); w[ 9] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 8] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 7] = __byte_perm_S (w[ 0], 0, selector); w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 8: w[63] = __byte_perm_S (w[55], w[54], selector); w[62] = __byte_perm_S (w[54], w[53], selector); w[61] = __byte_perm_S (w[53], w[52], selector); w[60] = __byte_perm_S (w[52], w[51], selector); w[59] = __byte_perm_S (w[51], w[50], selector); w[58] = __byte_perm_S (w[50], w[49], selector); w[57] = __byte_perm_S (w[49], w[48], selector); w[56] = __byte_perm_S (w[48], w[47], selector); w[55] = __byte_perm_S (w[47], w[46], selector); w[54] = __byte_perm_S (w[46], w[45], selector); w[53] = __byte_perm_S (w[45], w[44], selector); w[52] = __byte_perm_S (w[44], w[43], selector); w[51] = __byte_perm_S (w[43], w[42], selector); w[50] = __byte_perm_S (w[42], w[41], selector); w[49] = __byte_perm_S (w[41], w[40], selector); w[48] = __byte_perm_S (w[40], w[39], selector); w[47] = __byte_perm_S (w[39], w[38], selector); w[46] = __byte_perm_S (w[38], w[37], selector); 
w[45] = __byte_perm_S (w[37], w[36], selector); w[44] = __byte_perm_S (w[36], w[35], selector); w[43] = __byte_perm_S (w[35], w[34], selector); w[42] = __byte_perm_S (w[34], w[33], selector); w[41] = __byte_perm_S (w[33], w[32], selector); w[40] = __byte_perm_S (w[32], w[31], selector); w[39] = __byte_perm_S (w[31], w[30], selector); w[38] = __byte_perm_S (w[30], w[29], selector); w[37] = __byte_perm_S (w[29], w[28], selector); w[36] = __byte_perm_S (w[28], w[27], selector); w[35] = __byte_perm_S (w[27], w[26], selector); w[34] = __byte_perm_S (w[26], w[25], selector); w[33] = __byte_perm_S (w[25], w[24], selector); w[32] = __byte_perm_S (w[24], w[23], selector); w[31] = __byte_perm_S (w[23], w[22], selector); w[30] = __byte_perm_S (w[22], w[21], selector); w[29] = __byte_perm_S (w[21], w[20], selector); w[28] = __byte_perm_S (w[20], w[19], selector); w[27] = __byte_perm_S (w[19], w[18], selector); w[26] = __byte_perm_S (w[18], w[17], selector); w[25] = __byte_perm_S (w[17], w[16], selector); w[24] = __byte_perm_S (w[16], w[15], selector); w[23] = __byte_perm_S (w[15], w[14], selector); w[22] = __byte_perm_S (w[14], w[13], selector); w[21] = __byte_perm_S (w[13], w[12], selector); w[20] = __byte_perm_S (w[12], w[11], selector); w[19] = __byte_perm_S (w[11], w[10], selector); w[18] = __byte_perm_S (w[10], w[ 9], selector); w[17] = __byte_perm_S (w[ 9], w[ 8], selector); w[16] = __byte_perm_S (w[ 8], w[ 7], selector); w[15] = __byte_perm_S (w[ 7], w[ 6], selector); w[14] = __byte_perm_S (w[ 6], w[ 5], selector); w[13] = __byte_perm_S (w[ 5], w[ 4], selector); w[12] = __byte_perm_S (w[ 4], w[ 3], selector); w[11] = __byte_perm_S (w[ 3], w[ 2], selector); w[10] = __byte_perm_S (w[ 2], w[ 1], selector); w[ 9] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 8] = __byte_perm_S (w[ 0], 0, selector); w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 9: w[63] = __byte_perm_S (w[54], w[53], selector); w[62] = __byte_perm_S 
(w[53], w[52], selector); w[61] = __byte_perm_S (w[52], w[51], selector); w[60] = __byte_perm_S (w[51], w[50], selector); w[59] = __byte_perm_S (w[50], w[49], selector); w[58] = __byte_perm_S (w[49], w[48], selector); w[57] = __byte_perm_S (w[48], w[47], selector); w[56] = __byte_perm_S (w[47], w[46], selector); w[55] = __byte_perm_S (w[46], w[45], selector); w[54] = __byte_perm_S (w[45], w[44], selector); w[53] = __byte_perm_S (w[44], w[43], selector); w[52] = __byte_perm_S (w[43], w[42], selector); w[51] = __byte_perm_S (w[42], w[41], selector); w[50] = __byte_perm_S (w[41], w[40], selector); w[49] = __byte_perm_S (w[40], w[39], selector); w[48] = __byte_perm_S (w[39], w[38], selector); w[47] = __byte_perm_S (w[38], w[37], selector); w[46] = __byte_perm_S (w[37], w[36], selector); w[45] = __byte_perm_S (w[36], w[35], selector); w[44] = __byte_perm_S (w[35], w[34], selector); w[43] = __byte_perm_S (w[34], w[33], selector); w[42] = __byte_perm_S (w[33], w[32], selector); w[41] = __byte_perm_S (w[32], w[31], selector); w[40] = __byte_perm_S (w[31], w[30], selector); w[39] = __byte_perm_S (w[30], w[29], selector); w[38] = __byte_perm_S (w[29], w[28], selector); w[37] = __byte_perm_S (w[28], w[27], selector); w[36] = __byte_perm_S (w[27], w[26], selector); w[35] = __byte_perm_S (w[26], w[25], selector); w[34] = __byte_perm_S (w[25], w[24], selector); w[33] = __byte_perm_S (w[24], w[23], selector); w[32] = __byte_perm_S (w[23], w[22], selector); w[31] = __byte_perm_S (w[22], w[21], selector); w[30] = __byte_perm_S (w[21], w[20], selector); w[29] = __byte_perm_S (w[20], w[19], selector); w[28] = __byte_perm_S (w[19], w[18], selector); w[27] = __byte_perm_S (w[18], w[17], selector); w[26] = __byte_perm_S (w[17], w[16], selector); w[25] = __byte_perm_S (w[16], w[15], selector); w[24] = __byte_perm_S (w[15], w[14], selector); w[23] = __byte_perm_S (w[14], w[13], selector); w[22] = __byte_perm_S (w[13], w[12], selector); w[21] = __byte_perm_S (w[12], w[11], selector); w[20] 
= __byte_perm_S (w[11], w[10], selector); w[19] = __byte_perm_S (w[10], w[ 9], selector); w[18] = __byte_perm_S (w[ 9], w[ 8], selector); w[17] = __byte_perm_S (w[ 8], w[ 7], selector); w[16] = __byte_perm_S (w[ 7], w[ 6], selector); w[15] = __byte_perm_S (w[ 6], w[ 5], selector); w[14] = __byte_perm_S (w[ 5], w[ 4], selector); w[13] = __byte_perm_S (w[ 4], w[ 3], selector); w[12] = __byte_perm_S (w[ 3], w[ 2], selector); w[11] = __byte_perm_S (w[ 2], w[ 1], selector); w[10] = __byte_perm_S (w[ 1], w[ 0], selector); w[ 9] = __byte_perm_S (w[ 0], 0, selector); w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 10: w[63] = __byte_perm_S (w[53], w[52], selector); w[62] = __byte_perm_S (w[52], w[51], selector); w[61] = __byte_perm_S (w[51], w[50], selector); w[60] = __byte_perm_S (w[50], w[49], selector); w[59] = __byte_perm_S (w[49], w[48], selector); w[58] = __byte_perm_S (w[48], w[47], selector); w[57] = __byte_perm_S (w[47], w[46], selector); w[56] = __byte_perm_S (w[46], w[45], selector); w[55] = __byte_perm_S (w[45], w[44], selector); w[54] = __byte_perm_S (w[44], w[43], selector); w[53] = __byte_perm_S (w[43], w[42], selector); w[52] = __byte_perm_S (w[42], w[41], selector); w[51] = __byte_perm_S (w[41], w[40], selector); w[50] = __byte_perm_S (w[40], w[39], selector); w[49] = __byte_perm_S (w[39], w[38], selector); w[48] = __byte_perm_S (w[38], w[37], selector); w[47] = __byte_perm_S (w[37], w[36], selector); w[46] = __byte_perm_S (w[36], w[35], selector); w[45] = __byte_perm_S (w[35], w[34], selector); w[44] = __byte_perm_S (w[34], w[33], selector); w[43] = __byte_perm_S (w[33], w[32], selector); w[42] = __byte_perm_S (w[32], w[31], selector); w[41] = __byte_perm_S (w[31], w[30], selector); w[40] = __byte_perm_S (w[30], w[29], selector); w[39] = __byte_perm_S (w[29], w[28], selector); w[38] = __byte_perm_S (w[28], w[27], selector); w[37] = __byte_perm_S (w[27], w[26], selector); w[36] = __byte_perm_S 
(w[26], w[25], selector); w[35] = __byte_perm_S (w[25], w[24], selector); w[34] = __byte_perm_S (w[24], w[23], selector); w[33] = __byte_perm_S (w[23], w[22], selector); w[32] = __byte_perm_S (w[22], w[21], selector); w[31] = __byte_perm_S (w[21], w[20], selector); w[30] = __byte_perm_S (w[20], w[19], selector); w[29] = __byte_perm_S (w[19], w[18], selector); w[28] = __byte_perm_S (w[18], w[17], selector); w[27] = __byte_perm_S (w[17], w[16], selector); w[26] = __byte_perm_S (w[16], w[15], selector); w[25] = __byte_perm_S (w[15], w[14], selector); w[24] = __byte_perm_S (w[14], w[13], selector); w[23] = __byte_perm_S (w[13], w[12], selector); w[22] = __byte_perm_S (w[12], w[11], selector); w[21] = __byte_perm_S (w[11], w[10], selector); w[20] = __byte_perm_S (w[10], w[ 9], selector); w[19] = __byte_perm_S (w[ 9], w[ 8], selector); w[18] = __byte_perm_S (w[ 8], w[ 7], selector); w[17] = __byte_perm_S (w[ 7], w[ 6], selector); w[16] = __byte_perm_S (w[ 6], w[ 5], selector); w[15] = __byte_perm_S (w[ 5], w[ 4], selector); w[14] = __byte_perm_S (w[ 4], w[ 3], selector); w[13] = __byte_perm_S (w[ 3], w[ 2], selector); w[12] = __byte_perm_S (w[ 2], w[ 1], selector); w[11] = __byte_perm_S (w[ 1], w[ 0], selector); w[10] = __byte_perm_S (w[ 0], 0, selector); w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 11: w[63] = __byte_perm_S (w[52], w[51], selector); w[62] = __byte_perm_S (w[51], w[50], selector); w[61] = __byte_perm_S (w[50], w[49], selector); w[60] = __byte_perm_S (w[49], w[48], selector); w[59] = __byte_perm_S (w[48], w[47], selector); w[58] = __byte_perm_S (w[47], w[46], selector); w[57] = __byte_perm_S (w[46], w[45], selector); w[56] = __byte_perm_S (w[45], w[44], selector); w[55] = __byte_perm_S (w[44], w[43], selector); w[54] = __byte_perm_S (w[43], w[42], selector); w[53] = __byte_perm_S (w[42], w[41], selector); w[52] = __byte_perm_S (w[41], w[40], selector); w[51] = __byte_perm_S 
(w[40], w[39], selector); w[50] = __byte_perm_S (w[39], w[38], selector); w[49] = __byte_perm_S (w[38], w[37], selector); w[48] = __byte_perm_S (w[37], w[36], selector); w[47] = __byte_perm_S (w[36], w[35], selector); w[46] = __byte_perm_S (w[35], w[34], selector); w[45] = __byte_perm_S (w[34], w[33], selector); w[44] = __byte_perm_S (w[33], w[32], selector); w[43] = __byte_perm_S (w[32], w[31], selector); w[42] = __byte_perm_S (w[31], w[30], selector); w[41] = __byte_perm_S (w[30], w[29], selector); w[40] = __byte_perm_S (w[29], w[28], selector); w[39] = __byte_perm_S (w[28], w[27], selector); w[38] = __byte_perm_S (w[27], w[26], selector); w[37] = __byte_perm_S (w[26], w[25], selector); w[36] = __byte_perm_S (w[25], w[24], selector); w[35] = __byte_perm_S (w[24], w[23], selector); w[34] = __byte_perm_S (w[23], w[22], selector); w[33] = __byte_perm_S (w[22], w[21], selector); w[32] = __byte_perm_S (w[21], w[20], selector); w[31] = __byte_perm_S (w[20], w[19], selector); w[30] = __byte_perm_S (w[19], w[18], selector); w[29] = __byte_perm_S (w[18], w[17], selector); w[28] = __byte_perm_S (w[17], w[16], selector); w[27] = __byte_perm_S (w[16], w[15], selector); w[26] = __byte_perm_S (w[15], w[14], selector); w[25] = __byte_perm_S (w[14], w[13], selector); w[24] = __byte_perm_S (w[13], w[12], selector); w[23] = __byte_perm_S (w[12], w[11], selector); w[22] = __byte_perm_S (w[11], w[10], selector); w[21] = __byte_perm_S (w[10], w[ 9], selector); w[20] = __byte_perm_S (w[ 9], w[ 8], selector); w[19] = __byte_perm_S (w[ 8], w[ 7], selector); w[18] = __byte_perm_S (w[ 7], w[ 6], selector); w[17] = __byte_perm_S (w[ 6], w[ 5], selector); w[16] = __byte_perm_S (w[ 5], w[ 4], selector); w[15] = __byte_perm_S (w[ 4], w[ 3], selector); w[14] = __byte_perm_S (w[ 3], w[ 2], selector); w[13] = __byte_perm_S (w[ 2], w[ 1], selector); w[12] = __byte_perm_S (w[ 1], w[ 0], selector); w[11] = __byte_perm_S (w[ 0], 0, selector); w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 
5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 12: w[63] = __byte_perm_S (w[51], w[50], selector); w[62] = __byte_perm_S (w[50], w[49], selector); w[61] = __byte_perm_S (w[49], w[48], selector); w[60] = __byte_perm_S (w[48], w[47], selector); w[59] = __byte_perm_S (w[47], w[46], selector); w[58] = __byte_perm_S (w[46], w[45], selector); w[57] = __byte_perm_S (w[45], w[44], selector); w[56] = __byte_perm_S (w[44], w[43], selector); w[55] = __byte_perm_S (w[43], w[42], selector); w[54] = __byte_perm_S (w[42], w[41], selector); w[53] = __byte_perm_S (w[41], w[40], selector); w[52] = __byte_perm_S (w[40], w[39], selector); w[51] = __byte_perm_S (w[39], w[38], selector); w[50] = __byte_perm_S (w[38], w[37], selector); w[49] = __byte_perm_S (w[37], w[36], selector); w[48] = __byte_perm_S (w[36], w[35], selector); w[47] = __byte_perm_S (w[35], w[34], selector); w[46] = __byte_perm_S (w[34], w[33], selector); w[45] = __byte_perm_S (w[33], w[32], selector); w[44] = __byte_perm_S (w[32], w[31], selector); w[43] = __byte_perm_S (w[31], w[30], selector); w[42] = __byte_perm_S (w[30], w[29], selector); w[41] = __byte_perm_S (w[29], w[28], selector); w[40] = __byte_perm_S (w[28], w[27], selector); w[39] = __byte_perm_S (w[27], w[26], selector); w[38] = __byte_perm_S (w[26], w[25], selector); w[37] = __byte_perm_S (w[25], w[24], selector); w[36] = __byte_perm_S (w[24], w[23], selector); w[35] = __byte_perm_S (w[23], w[22], selector); w[34] = __byte_perm_S (w[22], w[21], selector); w[33] = __byte_perm_S (w[21], w[20], selector); w[32] = __byte_perm_S (w[20], w[19], selector); w[31] = __byte_perm_S (w[19], w[18], selector); w[30] = __byte_perm_S (w[18], w[17], selector); w[29] = __byte_perm_S (w[17], w[16], selector); w[28] = __byte_perm_S (w[16], w[15], selector); w[27] = __byte_perm_S (w[15], w[14], selector); w[26] = __byte_perm_S (w[14], w[13], selector); w[25] = __byte_perm_S (w[13], w[12], selector); w[24] = __byte_perm_S (w[12], w[11], selector); 
w[23] = __byte_perm_S (w[11], w[10], selector); w[22] = __byte_perm_S (w[10], w[ 9], selector); w[21] = __byte_perm_S (w[ 9], w[ 8], selector); w[20] = __byte_perm_S (w[ 8], w[ 7], selector); w[19] = __byte_perm_S (w[ 7], w[ 6], selector); w[18] = __byte_perm_S (w[ 6], w[ 5], selector); w[17] = __byte_perm_S (w[ 5], w[ 4], selector); w[16] = __byte_perm_S (w[ 4], w[ 3], selector); w[15] = __byte_perm_S (w[ 3], w[ 2], selector); w[14] = __byte_perm_S (w[ 2], w[ 1], selector); w[13] = __byte_perm_S (w[ 1], w[ 0], selector); w[12] = __byte_perm_S (w[ 0], 0, selector); w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 13: w[63] = __byte_perm_S (w[50], w[49], selector); w[62] = __byte_perm_S (w[49], w[48], selector); w[61] = __byte_perm_S (w[48], w[47], selector); w[60] = __byte_perm_S (w[47], w[46], selector); w[59] = __byte_perm_S (w[46], w[45], selector); w[58] = __byte_perm_S (w[45], w[44], selector); w[57] = __byte_perm_S (w[44], w[43], selector); w[56] = __byte_perm_S (w[43], w[42], selector); w[55] = __byte_perm_S (w[42], w[41], selector); w[54] = __byte_perm_S (w[41], w[40], selector); w[53] = __byte_perm_S (w[40], w[39], selector); w[52] = __byte_perm_S (w[39], w[38], selector); w[51] = __byte_perm_S (w[38], w[37], selector); w[50] = __byte_perm_S (w[37], w[36], selector); w[49] = __byte_perm_S (w[36], w[35], selector); w[48] = __byte_perm_S (w[35], w[34], selector); w[47] = __byte_perm_S (w[34], w[33], selector); w[46] = __byte_perm_S (w[33], w[32], selector); w[45] = __byte_perm_S (w[32], w[31], selector); w[44] = __byte_perm_S (w[31], w[30], selector); w[43] = __byte_perm_S (w[30], w[29], selector); w[42] = __byte_perm_S (w[29], w[28], selector); w[41] = __byte_perm_S (w[28], w[27], selector); w[40] = __byte_perm_S (w[27], w[26], selector); w[39] = __byte_perm_S (w[26], w[25], selector); w[38] = __byte_perm_S (w[25], w[24], selector); w[37] = __byte_perm_S (w[24], 
w[23], selector); w[36] = __byte_perm_S (w[23], w[22], selector); w[35] = __byte_perm_S (w[22], w[21], selector); w[34] = __byte_perm_S (w[21], w[20], selector); w[33] = __byte_perm_S (w[20], w[19], selector); w[32] = __byte_perm_S (w[19], w[18], selector); w[31] = __byte_perm_S (w[18], w[17], selector); w[30] = __byte_perm_S (w[17], w[16], selector); w[29] = __byte_perm_S (w[16], w[15], selector); w[28] = __byte_perm_S (w[15], w[14], selector); w[27] = __byte_perm_S (w[14], w[13], selector); w[26] = __byte_perm_S (w[13], w[12], selector); w[25] = __byte_perm_S (w[12], w[11], selector); w[24] = __byte_perm_S (w[11], w[10], selector); w[23] = __byte_perm_S (w[10], w[ 9], selector); w[22] = __byte_perm_S (w[ 9], w[ 8], selector); w[21] = __byte_perm_S (w[ 8], w[ 7], selector); w[20] = __byte_perm_S (w[ 7], w[ 6], selector); w[19] = __byte_perm_S (w[ 6], w[ 5], selector); w[18] = __byte_perm_S (w[ 5], w[ 4], selector); w[17] = __byte_perm_S (w[ 4], w[ 3], selector); w[16] = __byte_perm_S (w[ 3], w[ 2], selector); w[15] = __byte_perm_S (w[ 2], w[ 1], selector); w[14] = __byte_perm_S (w[ 1], w[ 0], selector); w[13] = __byte_perm_S (w[ 0], 0, selector); w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 14: w[63] = __byte_perm_S (w[49], w[48], selector); w[62] = __byte_perm_S (w[48], w[47], selector); w[61] = __byte_perm_S (w[47], w[46], selector); w[60] = __byte_perm_S (w[46], w[45], selector); w[59] = __byte_perm_S (w[45], w[44], selector); w[58] = __byte_perm_S (w[44], w[43], selector); w[57] = __byte_perm_S (w[43], w[42], selector); w[56] = __byte_perm_S (w[42], w[41], selector); w[55] = __byte_perm_S (w[41], w[40], selector); w[54] = __byte_perm_S (w[40], w[39], selector); w[53] = __byte_perm_S (w[39], w[38], selector); w[52] = __byte_perm_S (w[38], w[37], selector); w[51] = __byte_perm_S (w[37], w[36], selector); w[50] = __byte_perm_S (w[36], w[35], selector); 
w[49] = __byte_perm_S (w[35], w[34], selector); w[48] = __byte_perm_S (w[34], w[33], selector); w[47] = __byte_perm_S (w[33], w[32], selector); w[46] = __byte_perm_S (w[32], w[31], selector); w[45] = __byte_perm_S (w[31], w[30], selector); w[44] = __byte_perm_S (w[30], w[29], selector); w[43] = __byte_perm_S (w[29], w[28], selector); w[42] = __byte_perm_S (w[28], w[27], selector); w[41] = __byte_perm_S (w[27], w[26], selector); w[40] = __byte_perm_S (w[26], w[25], selector); w[39] = __byte_perm_S (w[25], w[24], selector); w[38] = __byte_perm_S (w[24], w[23], selector); w[37] = __byte_perm_S (w[23], w[22], selector); w[36] = __byte_perm_S (w[22], w[21], selector); w[35] = __byte_perm_S (w[21], w[20], selector); w[34] = __byte_perm_S (w[20], w[19], selector); w[33] = __byte_perm_S (w[19], w[18], selector); w[32] = __byte_perm_S (w[18], w[17], selector); w[31] = __byte_perm_S (w[17], w[16], selector); w[30] = __byte_perm_S (w[16], w[15], selector); w[29] = __byte_perm_S (w[15], w[14], selector); w[28] = __byte_perm_S (w[14], w[13], selector); w[27] = __byte_perm_S (w[13], w[12], selector); w[26] = __byte_perm_S (w[12], w[11], selector); w[25] = __byte_perm_S (w[11], w[10], selector); w[24] = __byte_perm_S (w[10], w[ 9], selector); w[23] = __byte_perm_S (w[ 9], w[ 8], selector); w[22] = __byte_perm_S (w[ 8], w[ 7], selector); w[21] = __byte_perm_S (w[ 7], w[ 6], selector); w[20] = __byte_perm_S (w[ 6], w[ 5], selector); w[19] = __byte_perm_S (w[ 5], w[ 4], selector); w[18] = __byte_perm_S (w[ 4], w[ 3], selector); w[17] = __byte_perm_S (w[ 3], w[ 2], selector); w[16] = __byte_perm_S (w[ 2], w[ 1], selector); w[15] = __byte_perm_S (w[ 1], w[ 0], selector); w[14] = __byte_perm_S (w[ 0], 0, selector); w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 15: w[63] = __byte_perm_S (w[48], w[47], selector); w[62] = __byte_perm_S (w[47], w[46], selector); w[61] = 
__byte_perm_S (w[46], w[45], selector); w[60] = __byte_perm_S (w[45], w[44], selector); w[59] = __byte_perm_S (w[44], w[43], selector); w[58] = __byte_perm_S (w[43], w[42], selector); w[57] = __byte_perm_S (w[42], w[41], selector); w[56] = __byte_perm_S (w[41], w[40], selector); w[55] = __byte_perm_S (w[40], w[39], selector); w[54] = __byte_perm_S (w[39], w[38], selector); w[53] = __byte_perm_S (w[38], w[37], selector); w[52] = __byte_perm_S (w[37], w[36], selector); w[51] = __byte_perm_S (w[36], w[35], selector); w[50] = __byte_perm_S (w[35], w[34], selector); w[49] = __byte_perm_S (w[34], w[33], selector); w[48] = __byte_perm_S (w[33], w[32], selector); w[47] = __byte_perm_S (w[32], w[31], selector); w[46] = __byte_perm_S (w[31], w[30], selector); w[45] = __byte_perm_S (w[30], w[29], selector); w[44] = __byte_perm_S (w[29], w[28], selector); w[43] = __byte_perm_S (w[28], w[27], selector); w[42] = __byte_perm_S (w[27], w[26], selector); w[41] = __byte_perm_S (w[26], w[25], selector); w[40] = __byte_perm_S (w[25], w[24], selector); w[39] = __byte_perm_S (w[24], w[23], selector); w[38] = __byte_perm_S (w[23], w[22], selector); w[37] = __byte_perm_S (w[22], w[21], selector); w[36] = __byte_perm_S (w[21], w[20], selector); w[35] = __byte_perm_S (w[20], w[19], selector); w[34] = __byte_perm_S (w[19], w[18], selector); w[33] = __byte_perm_S (w[18], w[17], selector); w[32] = __byte_perm_S (w[17], w[16], selector); w[31] = __byte_perm_S (w[16], w[15], selector); w[30] = __byte_perm_S (w[15], w[14], selector); w[29] = __byte_perm_S (w[14], w[13], selector); w[28] = __byte_perm_S (w[13], w[12], selector); w[27] = __byte_perm_S (w[12], w[11], selector); w[26] = __byte_perm_S (w[11], w[10], selector); w[25] = __byte_perm_S (w[10], w[ 9], selector); w[24] = __byte_perm_S (w[ 9], w[ 8], selector); w[23] = __byte_perm_S (w[ 8], w[ 7], selector); w[22] = __byte_perm_S (w[ 7], w[ 6], selector); w[21] = __byte_perm_S (w[ 6], w[ 5], selector); w[20] = __byte_perm_S (w[ 5], w[ 4], 
selector); w[19] = __byte_perm_S (w[ 4], w[ 3], selector); w[18] = __byte_perm_S (w[ 3], w[ 2], selector); w[17] = __byte_perm_S (w[ 2], w[ 1], selector); w[16] = __byte_perm_S (w[ 1], w[ 0], selector); w[15] = __byte_perm_S (w[ 0], 0, selector); w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 16: w[63] = __byte_perm_S (w[47], w[46], selector); w[62] = __byte_perm_S (w[46], w[45], selector); w[61] = __byte_perm_S (w[45], w[44], selector); w[60] = __byte_perm_S (w[44], w[43], selector); w[59] = __byte_perm_S (w[43], w[42], selector); w[58] = __byte_perm_S (w[42], w[41], selector); w[57] = __byte_perm_S (w[41], w[40], selector); w[56] = __byte_perm_S (w[40], w[39], selector); w[55] = __byte_perm_S (w[39], w[38], selector); w[54] = __byte_perm_S (w[38], w[37], selector); w[53] = __byte_perm_S (w[37], w[36], selector); w[52] = __byte_perm_S (w[36], w[35], selector); w[51] = __byte_perm_S (w[35], w[34], selector); w[50] = __byte_perm_S (w[34], w[33], selector); w[49] = __byte_perm_S (w[33], w[32], selector); w[48] = __byte_perm_S (w[32], w[31], selector); w[47] = __byte_perm_S (w[31], w[30], selector); w[46] = __byte_perm_S (w[30], w[29], selector); w[45] = __byte_perm_S (w[29], w[28], selector); w[44] = __byte_perm_S (w[28], w[27], selector); w[43] = __byte_perm_S (w[27], w[26], selector); w[42] = __byte_perm_S (w[26], w[25], selector); w[41] = __byte_perm_S (w[25], w[24], selector); w[40] = __byte_perm_S (w[24], w[23], selector); w[39] = __byte_perm_S (w[23], w[22], selector); w[38] = __byte_perm_S (w[22], w[21], selector); w[37] = __byte_perm_S (w[21], w[20], selector); w[36] = __byte_perm_S (w[20], w[19], selector); w[35] = __byte_perm_S (w[19], w[18], selector); w[34] = __byte_perm_S (w[18], w[17], selector); w[33] = __byte_perm_S (w[17], w[16], selector); w[32] = __byte_perm_S (w[16], w[15], selector); w[31] = __byte_perm_S (w[15], 
w[14], selector); w[30] = __byte_perm_S (w[14], w[13], selector); w[29] = __byte_perm_S (w[13], w[12], selector); w[28] = __byte_perm_S (w[12], w[11], selector); w[27] = __byte_perm_S (w[11], w[10], selector); w[26] = __byte_perm_S (w[10], w[ 9], selector); w[25] = __byte_perm_S (w[ 9], w[ 8], selector); w[24] = __byte_perm_S (w[ 8], w[ 7], selector); w[23] = __byte_perm_S (w[ 7], w[ 6], selector); w[22] = __byte_perm_S (w[ 6], w[ 5], selector); w[21] = __byte_perm_S (w[ 5], w[ 4], selector); w[20] = __byte_perm_S (w[ 4], w[ 3], selector); w[19] = __byte_perm_S (w[ 3], w[ 2], selector); w[18] = __byte_perm_S (w[ 2], w[ 1], selector); w[17] = __byte_perm_S (w[ 1], w[ 0], selector); w[16] = __byte_perm_S (w[ 0], 0, selector); w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 17: w[63] = __byte_perm_S (w[46], w[45], selector); w[62] = __byte_perm_S (w[45], w[44], selector); w[61] = __byte_perm_S (w[44], w[43], selector); w[60] = __byte_perm_S (w[43], w[42], selector); w[59] = __byte_perm_S (w[42], w[41], selector); w[58] = __byte_perm_S (w[41], w[40], selector); w[57] = __byte_perm_S (w[40], w[39], selector); w[56] = __byte_perm_S (w[39], w[38], selector); w[55] = __byte_perm_S (w[38], w[37], selector); w[54] = __byte_perm_S (w[37], w[36], selector); w[53] = __byte_perm_S (w[36], w[35], selector); w[52] = __byte_perm_S (w[35], w[34], selector); w[51] = __byte_perm_S (w[34], w[33], selector); w[50] = __byte_perm_S (w[33], w[32], selector); w[49] = __byte_perm_S (w[32], w[31], selector); w[48] = __byte_perm_S (w[31], w[30], selector); w[47] = __byte_perm_S (w[30], w[29], selector); w[46] = __byte_perm_S (w[29], w[28], selector); w[45] = __byte_perm_S (w[28], w[27], selector); w[44] = __byte_perm_S (w[27], w[26], selector); w[43] = __byte_perm_S (w[26], w[25], selector); w[42] = __byte_perm_S (w[25], w[24], selector); w[41] = 
__byte_perm_S (w[24], w[23], selector); w[40] = __byte_perm_S (w[23], w[22], selector); w[39] = __byte_perm_S (w[22], w[21], selector); w[38] = __byte_perm_S (w[21], w[20], selector); w[37] = __byte_perm_S (w[20], w[19], selector); w[36] = __byte_perm_S (w[19], w[18], selector); w[35] = __byte_perm_S (w[18], w[17], selector); w[34] = __byte_perm_S (w[17], w[16], selector); w[33] = __byte_perm_S (w[16], w[15], selector); w[32] = __byte_perm_S (w[15], w[14], selector); w[31] = __byte_perm_S (w[14], w[13], selector); w[30] = __byte_perm_S (w[13], w[12], selector); w[29] = __byte_perm_S (w[12], w[11], selector); w[28] = __byte_perm_S (w[11], w[10], selector); w[27] = __byte_perm_S (w[10], w[ 9], selector); w[26] = __byte_perm_S (w[ 9], w[ 8], selector); w[25] = __byte_perm_S (w[ 8], w[ 7], selector); w[24] = __byte_perm_S (w[ 7], w[ 6], selector); w[23] = __byte_perm_S (w[ 6], w[ 5], selector); w[22] = __byte_perm_S (w[ 5], w[ 4], selector); w[21] = __byte_perm_S (w[ 4], w[ 3], selector); w[20] = __byte_perm_S (w[ 3], w[ 2], selector); w[19] = __byte_perm_S (w[ 2], w[ 1], selector); w[18] = __byte_perm_S (w[ 1], w[ 0], selector); w[17] = __byte_perm_S (w[ 0], 0, selector); w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 18: w[63] = __byte_perm_S (w[45], w[44], selector); w[62] = __byte_perm_S (w[44], w[43], selector); w[61] = __byte_perm_S (w[43], w[42], selector); w[60] = __byte_perm_S (w[42], w[41], selector); w[59] = __byte_perm_S (w[41], w[40], selector); w[58] = __byte_perm_S (w[40], w[39], selector); w[57] = __byte_perm_S (w[39], w[38], selector); w[56] = __byte_perm_S (w[38], w[37], selector); w[55] = __byte_perm_S (w[37], w[36], selector); w[54] = __byte_perm_S (w[36], w[35], selector); w[53] = __byte_perm_S (w[35], w[34], selector); w[52] = __byte_perm_S (w[34], w[33], selector); w[51] = __byte_perm_S (w[33], 
w[32], selector); w[50] = __byte_perm_S (w[32], w[31], selector); w[49] = __byte_perm_S (w[31], w[30], selector); w[48] = __byte_perm_S (w[30], w[29], selector); w[47] = __byte_perm_S (w[29], w[28], selector); w[46] = __byte_perm_S (w[28], w[27], selector); w[45] = __byte_perm_S (w[27], w[26], selector); w[44] = __byte_perm_S (w[26], w[25], selector); w[43] = __byte_perm_S (w[25], w[24], selector); w[42] = __byte_perm_S (w[24], w[23], selector); w[41] = __byte_perm_S (w[23], w[22], selector); w[40] = __byte_perm_S (w[22], w[21], selector); w[39] = __byte_perm_S (w[21], w[20], selector); w[38] = __byte_perm_S (w[20], w[19], selector); w[37] = __byte_perm_S (w[19], w[18], selector); w[36] = __byte_perm_S (w[18], w[17], selector); w[35] = __byte_perm_S (w[17], w[16], selector); w[34] = __byte_perm_S (w[16], w[15], selector); w[33] = __byte_perm_S (w[15], w[14], selector); w[32] = __byte_perm_S (w[14], w[13], selector); w[31] = __byte_perm_S (w[13], w[12], selector); w[30] = __byte_perm_S (w[12], w[11], selector); w[29] = __byte_perm_S (w[11], w[10], selector); w[28] = __byte_perm_S (w[10], w[ 9], selector); w[27] = __byte_perm_S (w[ 9], w[ 8], selector); w[26] = __byte_perm_S (w[ 8], w[ 7], selector); w[25] = __byte_perm_S (w[ 7], w[ 6], selector); w[24] = __byte_perm_S (w[ 6], w[ 5], selector); w[23] = __byte_perm_S (w[ 5], w[ 4], selector); w[22] = __byte_perm_S (w[ 4], w[ 3], selector); w[21] = __byte_perm_S (w[ 3], w[ 2], selector); w[20] = __byte_perm_S (w[ 2], w[ 1], selector); w[19] = __byte_perm_S (w[ 1], w[ 0], selector); w[18] = __byte_perm_S (w[ 0], 0, selector); w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 19: w[63] = __byte_perm_S (w[44], w[43], selector); w[62] = __byte_perm_S (w[43], w[42], selector); w[61] = __byte_perm_S (w[42], w[41], selector); w[60] = __byte_perm_S (w[41], w[40], 
selector); w[59] = __byte_perm_S (w[40], w[39], selector); w[58] = __byte_perm_S (w[39], w[38], selector); w[57] = __byte_perm_S (w[38], w[37], selector); w[56] = __byte_perm_S (w[37], w[36], selector); w[55] = __byte_perm_S (w[36], w[35], selector); w[54] = __byte_perm_S (w[35], w[34], selector); w[53] = __byte_perm_S (w[34], w[33], selector); w[52] = __byte_perm_S (w[33], w[32], selector); w[51] = __byte_perm_S (w[32], w[31], selector); w[50] = __byte_perm_S (w[31], w[30], selector); w[49] = __byte_perm_S (w[30], w[29], selector); w[48] = __byte_perm_S (w[29], w[28], selector); w[47] = __byte_perm_S (w[28], w[27], selector); w[46] = __byte_perm_S (w[27], w[26], selector); w[45] = __byte_perm_S (w[26], w[25], selector); w[44] = __byte_perm_S (w[25], w[24], selector); w[43] = __byte_perm_S (w[24], w[23], selector); w[42] = __byte_perm_S (w[23], w[22], selector); w[41] = __byte_perm_S (w[22], w[21], selector); w[40] = __byte_perm_S (w[21], w[20], selector); w[39] = __byte_perm_S (w[20], w[19], selector); w[38] = __byte_perm_S (w[19], w[18], selector); w[37] = __byte_perm_S (w[18], w[17], selector); w[36] = __byte_perm_S (w[17], w[16], selector); w[35] = __byte_perm_S (w[16], w[15], selector); w[34] = __byte_perm_S (w[15], w[14], selector); w[33] = __byte_perm_S (w[14], w[13], selector); w[32] = __byte_perm_S (w[13], w[12], selector); w[31] = __byte_perm_S (w[12], w[11], selector); w[30] = __byte_perm_S (w[11], w[10], selector); w[29] = __byte_perm_S (w[10], w[ 9], selector); w[28] = __byte_perm_S (w[ 9], w[ 8], selector); w[27] = __byte_perm_S (w[ 8], w[ 7], selector); w[26] = __byte_perm_S (w[ 7], w[ 6], selector); w[25] = __byte_perm_S (w[ 6], w[ 5], selector); w[24] = __byte_perm_S (w[ 5], w[ 4], selector); w[23] = __byte_perm_S (w[ 4], w[ 3], selector); w[22] = __byte_perm_S (w[ 3], w[ 2], selector); w[21] = __byte_perm_S (w[ 2], w[ 1], selector); w[20] = __byte_perm_S (w[ 1], w[ 0], selector); w[19] = __byte_perm_S (w[ 0], 0, selector); w[18] = 0; w[17] = 0; 
w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 20: w[63] = __byte_perm_S (w[43], w[42], selector); w[62] = __byte_perm_S (w[42], w[41], selector); w[61] = __byte_perm_S (w[41], w[40], selector); w[60] = __byte_perm_S (w[40], w[39], selector); w[59] = __byte_perm_S (w[39], w[38], selector); w[58] = __byte_perm_S (w[38], w[37], selector); w[57] = __byte_perm_S (w[37], w[36], selector); w[56] = __byte_perm_S (w[36], w[35], selector); w[55] = __byte_perm_S (w[35], w[34], selector); w[54] = __byte_perm_S (w[34], w[33], selector); w[53] = __byte_perm_S (w[33], w[32], selector); w[52] = __byte_perm_S (w[32], w[31], selector); w[51] = __byte_perm_S (w[31], w[30], selector); w[50] = __byte_perm_S (w[30], w[29], selector); w[49] = __byte_perm_S (w[29], w[28], selector); w[48] = __byte_perm_S (w[28], w[27], selector); w[47] = __byte_perm_S (w[27], w[26], selector); w[46] = __byte_perm_S (w[26], w[25], selector); w[45] = __byte_perm_S (w[25], w[24], selector); w[44] = __byte_perm_S (w[24], w[23], selector); w[43] = __byte_perm_S (w[23], w[22], selector); w[42] = __byte_perm_S (w[22], w[21], selector); w[41] = __byte_perm_S (w[21], w[20], selector); w[40] = __byte_perm_S (w[20], w[19], selector); w[39] = __byte_perm_S (w[19], w[18], selector); w[38] = __byte_perm_S (w[18], w[17], selector); w[37] = __byte_perm_S (w[17], w[16], selector); w[36] = __byte_perm_S (w[16], w[15], selector); w[35] = __byte_perm_S (w[15], w[14], selector); w[34] = __byte_perm_S (w[14], w[13], selector); w[33] = __byte_perm_S (w[13], w[12], selector); w[32] = __byte_perm_S (w[12], w[11], selector); w[31] = __byte_perm_S (w[11], w[10], selector); w[30] = __byte_perm_S (w[10], w[ 9], selector); w[29] = __byte_perm_S (w[ 9], w[ 8], selector); w[28] = __byte_perm_S (w[ 8], w[ 7], selector); w[27] = __byte_perm_S (w[ 7], w[ 6], selector); w[26] = 
__byte_perm_S (w[ 6], w[ 5], selector); w[25] = __byte_perm_S (w[ 5], w[ 4], selector); w[24] = __byte_perm_S (w[ 4], w[ 3], selector); w[23] = __byte_perm_S (w[ 3], w[ 2], selector); w[22] = __byte_perm_S (w[ 2], w[ 1], selector); w[21] = __byte_perm_S (w[ 1], w[ 0], selector); w[20] = __byte_perm_S (w[ 0], 0, selector); w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 21: w[63] = __byte_perm_S (w[42], w[41], selector); w[62] = __byte_perm_S (w[41], w[40], selector); w[61] = __byte_perm_S (w[40], w[39], selector); w[60] = __byte_perm_S (w[39], w[38], selector); w[59] = __byte_perm_S (w[38], w[37], selector); w[58] = __byte_perm_S (w[37], w[36], selector); w[57] = __byte_perm_S (w[36], w[35], selector); w[56] = __byte_perm_S (w[35], w[34], selector); w[55] = __byte_perm_S (w[34], w[33], selector); w[54] = __byte_perm_S (w[33], w[32], selector); w[53] = __byte_perm_S (w[32], w[31], selector); w[52] = __byte_perm_S (w[31], w[30], selector); w[51] = __byte_perm_S (w[30], w[29], selector); w[50] = __byte_perm_S (w[29], w[28], selector); w[49] = __byte_perm_S (w[28], w[27], selector); w[48] = __byte_perm_S (w[27], w[26], selector); w[47] = __byte_perm_S (w[26], w[25], selector); w[46] = __byte_perm_S (w[25], w[24], selector); w[45] = __byte_perm_S (w[24], w[23], selector); w[44] = __byte_perm_S (w[23], w[22], selector); w[43] = __byte_perm_S (w[22], w[21], selector); w[42] = __byte_perm_S (w[21], w[20], selector); w[41] = __byte_perm_S (w[20], w[19], selector); w[40] = __byte_perm_S (w[19], w[18], selector); w[39] = __byte_perm_S (w[18], w[17], selector); w[38] = __byte_perm_S (w[17], w[16], selector); w[37] = __byte_perm_S (w[16], w[15], selector); w[36] = __byte_perm_S (w[15], w[14], selector); w[35] = __byte_perm_S (w[14], w[13], selector); w[34] = __byte_perm_S (w[13], w[12], selector); 
w[33] = __byte_perm_S (w[12], w[11], selector); w[32] = __byte_perm_S (w[11], w[10], selector); w[31] = __byte_perm_S (w[10], w[ 9], selector); w[30] = __byte_perm_S (w[ 9], w[ 8], selector); w[29] = __byte_perm_S (w[ 8], w[ 7], selector); w[28] = __byte_perm_S (w[ 7], w[ 6], selector); w[27] = __byte_perm_S (w[ 6], w[ 5], selector); w[26] = __byte_perm_S (w[ 5], w[ 4], selector); w[25] = __byte_perm_S (w[ 4], w[ 3], selector); w[24] = __byte_perm_S (w[ 3], w[ 2], selector); w[23] = __byte_perm_S (w[ 2], w[ 1], selector); w[22] = __byte_perm_S (w[ 1], w[ 0], selector); w[21] = __byte_perm_S (w[ 0], 0, selector); w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 22: w[63] = __byte_perm_S (w[41], w[40], selector); w[62] = __byte_perm_S (w[40], w[39], selector); w[61] = __byte_perm_S (w[39], w[38], selector); w[60] = __byte_perm_S (w[38], w[37], selector); w[59] = __byte_perm_S (w[37], w[36], selector); w[58] = __byte_perm_S (w[36], w[35], selector); w[57] = __byte_perm_S (w[35], w[34], selector); w[56] = __byte_perm_S (w[34], w[33], selector); w[55] = __byte_perm_S (w[33], w[32], selector); w[54] = __byte_perm_S (w[32], w[31], selector); w[53] = __byte_perm_S (w[31], w[30], selector); w[52] = __byte_perm_S (w[30], w[29], selector); w[51] = __byte_perm_S (w[29], w[28], selector); w[50] = __byte_perm_S (w[28], w[27], selector); w[49] = __byte_perm_S (w[27], w[26], selector); w[48] = __byte_perm_S (w[26], w[25], selector); w[47] = __byte_perm_S (w[25], w[24], selector); w[46] = __byte_perm_S (w[24], w[23], selector); w[45] = __byte_perm_S (w[23], w[22], selector); w[44] = __byte_perm_S (w[22], w[21], selector); w[43] = __byte_perm_S (w[21], w[20], selector); w[42] = __byte_perm_S (w[20], w[19], selector); w[41] = __byte_perm_S (w[19], w[18], selector); w[40] = __byte_perm_S 
(w[18], w[17], selector); w[39] = __byte_perm_S (w[17], w[16], selector); w[38] = __byte_perm_S (w[16], w[15], selector); w[37] = __byte_perm_S (w[15], w[14], selector); w[36] = __byte_perm_S (w[14], w[13], selector); w[35] = __byte_perm_S (w[13], w[12], selector); w[34] = __byte_perm_S (w[12], w[11], selector); w[33] = __byte_perm_S (w[11], w[10], selector); w[32] = __byte_perm_S (w[10], w[ 9], selector); w[31] = __byte_perm_S (w[ 9], w[ 8], selector); w[30] = __byte_perm_S (w[ 8], w[ 7], selector); w[29] = __byte_perm_S (w[ 7], w[ 6], selector); w[28] = __byte_perm_S (w[ 6], w[ 5], selector); w[27] = __byte_perm_S (w[ 5], w[ 4], selector); w[26] = __byte_perm_S (w[ 4], w[ 3], selector); w[25] = __byte_perm_S (w[ 3], w[ 2], selector); w[24] = __byte_perm_S (w[ 2], w[ 1], selector); w[23] = __byte_perm_S (w[ 1], w[ 0], selector); w[22] = __byte_perm_S (w[ 0], 0, selector); w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 23: w[63] = __byte_perm_S (w[40], w[39], selector); w[62] = __byte_perm_S (w[39], w[38], selector); w[61] = __byte_perm_S (w[38], w[37], selector); w[60] = __byte_perm_S (w[37], w[36], selector); w[59] = __byte_perm_S (w[36], w[35], selector); w[58] = __byte_perm_S (w[35], w[34], selector); w[57] = __byte_perm_S (w[34], w[33], selector); w[56] = __byte_perm_S (w[33], w[32], selector); w[55] = __byte_perm_S (w[32], w[31], selector); w[54] = __byte_perm_S (w[31], w[30], selector); w[53] = __byte_perm_S (w[30], w[29], selector); w[52] = __byte_perm_S (w[29], w[28], selector); w[51] = __byte_perm_S (w[28], w[27], selector); w[50] = __byte_perm_S (w[27], w[26], selector); w[49] = __byte_perm_S (w[26], w[25], selector); w[48] = __byte_perm_S (w[25], w[24], selector); w[47] = __byte_perm_S (w[24], w[23], selector); w[46] = __byte_perm_S (w[23], w[22], 
selector); w[45] = __byte_perm_S (w[22], w[21], selector); w[44] = __byte_perm_S (w[21], w[20], selector); w[43] = __byte_perm_S (w[20], w[19], selector); w[42] = __byte_perm_S (w[19], w[18], selector); w[41] = __byte_perm_S (w[18], w[17], selector); w[40] = __byte_perm_S (w[17], w[16], selector); w[39] = __byte_perm_S (w[16], w[15], selector); w[38] = __byte_perm_S (w[15], w[14], selector); w[37] = __byte_perm_S (w[14], w[13], selector); w[36] = __byte_perm_S (w[13], w[12], selector); w[35] = __byte_perm_S (w[12], w[11], selector); w[34] = __byte_perm_S (w[11], w[10], selector); w[33] = __byte_perm_S (w[10], w[ 9], selector); w[32] = __byte_perm_S (w[ 9], w[ 8], selector); w[31] = __byte_perm_S (w[ 8], w[ 7], selector); w[30] = __byte_perm_S (w[ 7], w[ 6], selector); w[29] = __byte_perm_S (w[ 6], w[ 5], selector); w[28] = __byte_perm_S (w[ 5], w[ 4], selector); w[27] = __byte_perm_S (w[ 4], w[ 3], selector); w[26] = __byte_perm_S (w[ 3], w[ 2], selector); w[25] = __byte_perm_S (w[ 2], w[ 1], selector); w[24] = __byte_perm_S (w[ 1], w[ 0], selector); w[23] = __byte_perm_S (w[ 0], 0, selector); w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 24: w[63] = __byte_perm_S (w[39], w[38], selector); w[62] = __byte_perm_S (w[38], w[37], selector); w[61] = __byte_perm_S (w[37], w[36], selector); w[60] = __byte_perm_S (w[36], w[35], selector); w[59] = __byte_perm_S (w[35], w[34], selector); w[58] = __byte_perm_S (w[34], w[33], selector); w[57] = __byte_perm_S (w[33], w[32], selector); w[56] = __byte_perm_S (w[32], w[31], selector); w[55] = __byte_perm_S (w[31], w[30], selector); w[54] = __byte_perm_S (w[30], w[29], selector); w[53] = __byte_perm_S (w[29], w[28], selector); w[52] = __byte_perm_S (w[28], w[27], selector); w[51] = __byte_perm_S (w[27], w[26], 
selector); w[50] = __byte_perm_S (w[26], w[25], selector); w[49] = __byte_perm_S (w[25], w[24], selector); w[48] = __byte_perm_S (w[24], w[23], selector); w[47] = __byte_perm_S (w[23], w[22], selector); w[46] = __byte_perm_S (w[22], w[21], selector); w[45] = __byte_perm_S (w[21], w[20], selector); w[44] = __byte_perm_S (w[20], w[19], selector); w[43] = __byte_perm_S (w[19], w[18], selector); w[42] = __byte_perm_S (w[18], w[17], selector); w[41] = __byte_perm_S (w[17], w[16], selector); w[40] = __byte_perm_S (w[16], w[15], selector); w[39] = __byte_perm_S (w[15], w[14], selector); w[38] = __byte_perm_S (w[14], w[13], selector); w[37] = __byte_perm_S (w[13], w[12], selector); w[36] = __byte_perm_S (w[12], w[11], selector); w[35] = __byte_perm_S (w[11], w[10], selector); w[34] = __byte_perm_S (w[10], w[ 9], selector); w[33] = __byte_perm_S (w[ 9], w[ 8], selector); w[32] = __byte_perm_S (w[ 8], w[ 7], selector); w[31] = __byte_perm_S (w[ 7], w[ 6], selector); w[30] = __byte_perm_S (w[ 6], w[ 5], selector); w[29] = __byte_perm_S (w[ 5], w[ 4], selector); w[28] = __byte_perm_S (w[ 4], w[ 3], selector); w[27] = __byte_perm_S (w[ 3], w[ 2], selector); w[26] = __byte_perm_S (w[ 2], w[ 1], selector); w[25] = __byte_perm_S (w[ 1], w[ 0], selector); w[24] = __byte_perm_S (w[ 0], 0, selector); w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 25: w[63] = __byte_perm_S (w[38], w[37], selector); w[62] = __byte_perm_S (w[37], w[36], selector); w[61] = __byte_perm_S (w[36], w[35], selector); w[60] = __byte_perm_S (w[35], w[34], selector); w[59] = __byte_perm_S (w[34], w[33], selector); w[58] = __byte_perm_S (w[33], w[32], selector); w[57] = __byte_perm_S (w[32], w[31], selector); w[56] = __byte_perm_S (w[31], w[30], selector); w[55] = __byte_perm_S (w[30], 
w[29], selector); w[54] = __byte_perm_S (w[29], w[28], selector); w[53] = __byte_perm_S (w[28], w[27], selector); w[52] = __byte_perm_S (w[27], w[26], selector); w[51] = __byte_perm_S (w[26], w[25], selector); w[50] = __byte_perm_S (w[25], w[24], selector); w[49] = __byte_perm_S (w[24], w[23], selector); w[48] = __byte_perm_S (w[23], w[22], selector); w[47] = __byte_perm_S (w[22], w[21], selector); w[46] = __byte_perm_S (w[21], w[20], selector); w[45] = __byte_perm_S (w[20], w[19], selector); w[44] = __byte_perm_S (w[19], w[18], selector); w[43] = __byte_perm_S (w[18], w[17], selector); w[42] = __byte_perm_S (w[17], w[16], selector); w[41] = __byte_perm_S (w[16], w[15], selector); w[40] = __byte_perm_S (w[15], w[14], selector); w[39] = __byte_perm_S (w[14], w[13], selector); w[38] = __byte_perm_S (w[13], w[12], selector); w[37] = __byte_perm_S (w[12], w[11], selector); w[36] = __byte_perm_S (w[11], w[10], selector); w[35] = __byte_perm_S (w[10], w[ 9], selector); w[34] = __byte_perm_S (w[ 9], w[ 8], selector); w[33] = __byte_perm_S (w[ 8], w[ 7], selector); w[32] = __byte_perm_S (w[ 7], w[ 6], selector); w[31] = __byte_perm_S (w[ 6], w[ 5], selector); w[30] = __byte_perm_S (w[ 5], w[ 4], selector); w[29] = __byte_perm_S (w[ 4], w[ 3], selector); w[28] = __byte_perm_S (w[ 3], w[ 2], selector); w[27] = __byte_perm_S (w[ 2], w[ 1], selector); w[26] = __byte_perm_S (w[ 1], w[ 0], selector); w[25] = __byte_perm_S (w[ 0], 0, selector); w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 26: w[63] = __byte_perm_S (w[37], w[36], selector); w[62] = __byte_perm_S (w[36], w[35], selector); w[61] = __byte_perm_S (w[35], w[34], selector); w[60] = __byte_perm_S (w[34], w[33], selector); w[59] = __byte_perm_S (w[33], w[32], selector); w[58] = 
__byte_perm_S (w[32], w[31], selector); w[57] = __byte_perm_S (w[31], w[30], selector); w[56] = __byte_perm_S (w[30], w[29], selector); w[55] = __byte_perm_S (w[29], w[28], selector); w[54] = __byte_perm_S (w[28], w[27], selector); w[53] = __byte_perm_S (w[27], w[26], selector); w[52] = __byte_perm_S (w[26], w[25], selector); w[51] = __byte_perm_S (w[25], w[24], selector); w[50] = __byte_perm_S (w[24], w[23], selector); w[49] = __byte_perm_S (w[23], w[22], selector); w[48] = __byte_perm_S (w[22], w[21], selector); w[47] = __byte_perm_S (w[21], w[20], selector); w[46] = __byte_perm_S (w[20], w[19], selector); w[45] = __byte_perm_S (w[19], w[18], selector); w[44] = __byte_perm_S (w[18], w[17], selector); w[43] = __byte_perm_S (w[17], w[16], selector); w[42] = __byte_perm_S (w[16], w[15], selector); w[41] = __byte_perm_S (w[15], w[14], selector); w[40] = __byte_perm_S (w[14], w[13], selector); w[39] = __byte_perm_S (w[13], w[12], selector); w[38] = __byte_perm_S (w[12], w[11], selector); w[37] = __byte_perm_S (w[11], w[10], selector); w[36] = __byte_perm_S (w[10], w[ 9], selector); w[35] = __byte_perm_S (w[ 9], w[ 8], selector); w[34] = __byte_perm_S (w[ 8], w[ 7], selector); w[33] = __byte_perm_S (w[ 7], w[ 6], selector); w[32] = __byte_perm_S (w[ 6], w[ 5], selector); w[31] = __byte_perm_S (w[ 5], w[ 4], selector); w[30] = __byte_perm_S (w[ 4], w[ 3], selector); w[29] = __byte_perm_S (w[ 3], w[ 2], selector); w[28] = __byte_perm_S (w[ 2], w[ 1], selector); w[27] = __byte_perm_S (w[ 1], w[ 0], selector); w[26] = __byte_perm_S (w[ 0], 0, selector); w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 27: w[63] = __byte_perm_S (w[36], w[35], selector); w[62] = __byte_perm_S (w[35], w[34], selector); w[61] = __byte_perm_S (w[34], 
w[33], selector); w[60] = __byte_perm_S (w[33], w[32], selector); w[59] = __byte_perm_S (w[32], w[31], selector); w[58] = __byte_perm_S (w[31], w[30], selector); w[57] = __byte_perm_S (w[30], w[29], selector); w[56] = __byte_perm_S (w[29], w[28], selector); w[55] = __byte_perm_S (w[28], w[27], selector); w[54] = __byte_perm_S (w[27], w[26], selector); w[53] = __byte_perm_S (w[26], w[25], selector); w[52] = __byte_perm_S (w[25], w[24], selector); w[51] = __byte_perm_S (w[24], w[23], selector); w[50] = __byte_perm_S (w[23], w[22], selector); w[49] = __byte_perm_S (w[22], w[21], selector); w[48] = __byte_perm_S (w[21], w[20], selector); w[47] = __byte_perm_S (w[20], w[19], selector); w[46] = __byte_perm_S (w[19], w[18], selector); w[45] = __byte_perm_S (w[18], w[17], selector); w[44] = __byte_perm_S (w[17], w[16], selector); w[43] = __byte_perm_S (w[16], w[15], selector); w[42] = __byte_perm_S (w[15], w[14], selector); w[41] = __byte_perm_S (w[14], w[13], selector); w[40] = __byte_perm_S (w[13], w[12], selector); w[39] = __byte_perm_S (w[12], w[11], selector); w[38] = __byte_perm_S (w[11], w[10], selector); w[37] = __byte_perm_S (w[10], w[ 9], selector); w[36] = __byte_perm_S (w[ 9], w[ 8], selector); w[35] = __byte_perm_S (w[ 8], w[ 7], selector); w[34] = __byte_perm_S (w[ 7], w[ 6], selector); w[33] = __byte_perm_S (w[ 6], w[ 5], selector); w[32] = __byte_perm_S (w[ 5], w[ 4], selector); w[31] = __byte_perm_S (w[ 4], w[ 3], selector); w[30] = __byte_perm_S (w[ 3], w[ 2], selector); w[29] = __byte_perm_S (w[ 2], w[ 1], selector); w[28] = __byte_perm_S (w[ 1], w[ 0], selector); w[27] = __byte_perm_S (w[ 0], 0, selector); w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 28: w[63] = __byte_perm_S (w[35], w[34], 
selector); w[62] = __byte_perm_S (w[34], w[33], selector); w[61] = __byte_perm_S (w[33], w[32], selector); w[60] = __byte_perm_S (w[32], w[31], selector); w[59] = __byte_perm_S (w[31], w[30], selector); w[58] = __byte_perm_S (w[30], w[29], selector); w[57] = __byte_perm_S (w[29], w[28], selector); w[56] = __byte_perm_S (w[28], w[27], selector); w[55] = __byte_perm_S (w[27], w[26], selector); w[54] = __byte_perm_S (w[26], w[25], selector); w[53] = __byte_perm_S (w[25], w[24], selector); w[52] = __byte_perm_S (w[24], w[23], selector); w[51] = __byte_perm_S (w[23], w[22], selector); w[50] = __byte_perm_S (w[22], w[21], selector); w[49] = __byte_perm_S (w[21], w[20], selector); w[48] = __byte_perm_S (w[20], w[19], selector); w[47] = __byte_perm_S (w[19], w[18], selector); w[46] = __byte_perm_S (w[18], w[17], selector); w[45] = __byte_perm_S (w[17], w[16], selector); w[44] = __byte_perm_S (w[16], w[15], selector); w[43] = __byte_perm_S (w[15], w[14], selector); w[42] = __byte_perm_S (w[14], w[13], selector); w[41] = __byte_perm_S (w[13], w[12], selector); w[40] = __byte_perm_S (w[12], w[11], selector); w[39] = __byte_perm_S (w[11], w[10], selector); w[38] = __byte_perm_S (w[10], w[ 9], selector); w[37] = __byte_perm_S (w[ 9], w[ 8], selector); w[36] = __byte_perm_S (w[ 8], w[ 7], selector); w[35] = __byte_perm_S (w[ 7], w[ 6], selector); w[34] = __byte_perm_S (w[ 6], w[ 5], selector); w[33] = __byte_perm_S (w[ 5], w[ 4], selector); w[32] = __byte_perm_S (w[ 4], w[ 3], selector); w[31] = __byte_perm_S (w[ 3], w[ 2], selector); w[30] = __byte_perm_S (w[ 2], w[ 1], selector); w[29] = __byte_perm_S (w[ 1], w[ 0], selector); w[28] = __byte_perm_S (w[ 0], 0, selector); w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; 
break; case 29: w[63] = __byte_perm_S (w[34], w[33], selector); w[62] = __byte_perm_S (w[33], w[32], selector); w[61] = __byte_perm_S (w[32], w[31], selector); w[60] = __byte_perm_S (w[31], w[30], selector); w[59] = __byte_perm_S (w[30], w[29], selector); w[58] = __byte_perm_S (w[29], w[28], selector); w[57] = __byte_perm_S (w[28], w[27], selector); w[56] = __byte_perm_S (w[27], w[26], selector); w[55] = __byte_perm_S (w[26], w[25], selector); w[54] = __byte_perm_S (w[25], w[24], selector); w[53] = __byte_perm_S (w[24], w[23], selector); w[52] = __byte_perm_S (w[23], w[22], selector); w[51] = __byte_perm_S (w[22], w[21], selector); w[50] = __byte_perm_S (w[21], w[20], selector); w[49] = __byte_perm_S (w[20], w[19], selector); w[48] = __byte_perm_S (w[19], w[18], selector); w[47] = __byte_perm_S (w[18], w[17], selector); w[46] = __byte_perm_S (w[17], w[16], selector); w[45] = __byte_perm_S (w[16], w[15], selector); w[44] = __byte_perm_S (w[15], w[14], selector); w[43] = __byte_perm_S (w[14], w[13], selector); w[42] = __byte_perm_S (w[13], w[12], selector); w[41] = __byte_perm_S (w[12], w[11], selector); w[40] = __byte_perm_S (w[11], w[10], selector); w[39] = __byte_perm_S (w[10], w[ 9], selector); w[38] = __byte_perm_S (w[ 9], w[ 8], selector); w[37] = __byte_perm_S (w[ 8], w[ 7], selector); w[36] = __byte_perm_S (w[ 7], w[ 6], selector); w[35] = __byte_perm_S (w[ 6], w[ 5], selector); w[34] = __byte_perm_S (w[ 5], w[ 4], selector); w[33] = __byte_perm_S (w[ 4], w[ 3], selector); w[32] = __byte_perm_S (w[ 3], w[ 2], selector); w[31] = __byte_perm_S (w[ 2], w[ 1], selector); w[30] = __byte_perm_S (w[ 1], w[ 0], selector); w[29] = __byte_perm_S (w[ 0], 0, selector); w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; 
w[ 0] = 0; break; case 30: w[63] = __byte_perm_S (w[33], w[32], selector); w[62] = __byte_perm_S (w[32], w[31], selector); w[61] = __byte_perm_S (w[31], w[30], selector); w[60] = __byte_perm_S (w[30], w[29], selector); w[59] = __byte_perm_S (w[29], w[28], selector); w[58] = __byte_perm_S (w[28], w[27], selector); w[57] = __byte_perm_S (w[27], w[26], selector); w[56] = __byte_perm_S (w[26], w[25], selector); w[55] = __byte_perm_S (w[25], w[24], selector); w[54] = __byte_perm_S (w[24], w[23], selector); w[53] = __byte_perm_S (w[23], w[22], selector); w[52] = __byte_perm_S (w[22], w[21], selector); w[51] = __byte_perm_S (w[21], w[20], selector); w[50] = __byte_perm_S (w[20], w[19], selector); w[49] = __byte_perm_S (w[19], w[18], selector); w[48] = __byte_perm_S (w[18], w[17], selector); w[47] = __byte_perm_S (w[17], w[16], selector); w[46] = __byte_perm_S (w[16], w[15], selector); w[45] = __byte_perm_S (w[15], w[14], selector); w[44] = __byte_perm_S (w[14], w[13], selector); w[43] = __byte_perm_S (w[13], w[12], selector); w[42] = __byte_perm_S (w[12], w[11], selector); w[41] = __byte_perm_S (w[11], w[10], selector); w[40] = __byte_perm_S (w[10], w[ 9], selector); w[39] = __byte_perm_S (w[ 9], w[ 8], selector); w[38] = __byte_perm_S (w[ 8], w[ 7], selector); w[37] = __byte_perm_S (w[ 7], w[ 6], selector); w[36] = __byte_perm_S (w[ 6], w[ 5], selector); w[35] = __byte_perm_S (w[ 5], w[ 4], selector); w[34] = __byte_perm_S (w[ 4], w[ 3], selector); w[33] = __byte_perm_S (w[ 3], w[ 2], selector); w[32] = __byte_perm_S (w[ 2], w[ 1], selector); w[31] = __byte_perm_S (w[ 1], w[ 0], selector); w[30] = __byte_perm_S (w[ 0], 0, selector); w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 
31: w[63] = __byte_perm_S (w[32], w[31], selector); w[62] = __byte_perm_S (w[31], w[30], selector); w[61] = __byte_perm_S (w[30], w[29], selector); w[60] = __byte_perm_S (w[29], w[28], selector); w[59] = __byte_perm_S (w[28], w[27], selector); w[58] = __byte_perm_S (w[27], w[26], selector); w[57] = __byte_perm_S (w[26], w[25], selector); w[56] = __byte_perm_S (w[25], w[24], selector); w[55] = __byte_perm_S (w[24], w[23], selector); w[54] = __byte_perm_S (w[23], w[22], selector); w[53] = __byte_perm_S (w[22], w[21], selector); w[52] = __byte_perm_S (w[21], w[20], selector); w[51] = __byte_perm_S (w[20], w[19], selector); w[50] = __byte_perm_S (w[19], w[18], selector); w[49] = __byte_perm_S (w[18], w[17], selector); w[48] = __byte_perm_S (w[17], w[16], selector); w[47] = __byte_perm_S (w[16], w[15], selector); w[46] = __byte_perm_S (w[15], w[14], selector); w[45] = __byte_perm_S (w[14], w[13], selector); w[44] = __byte_perm_S (w[13], w[12], selector); w[43] = __byte_perm_S (w[12], w[11], selector); w[42] = __byte_perm_S (w[11], w[10], selector); w[41] = __byte_perm_S (w[10], w[ 9], selector); w[40] = __byte_perm_S (w[ 9], w[ 8], selector); w[39] = __byte_perm_S (w[ 8], w[ 7], selector); w[38] = __byte_perm_S (w[ 7], w[ 6], selector); w[37] = __byte_perm_S (w[ 6], w[ 5], selector); w[36] = __byte_perm_S (w[ 5], w[ 4], selector); w[35] = __byte_perm_S (w[ 4], w[ 3], selector); w[34] = __byte_perm_S (w[ 3], w[ 2], selector); w[33] = __byte_perm_S (w[ 2], w[ 1], selector); w[32] = __byte_perm_S (w[ 1], w[ 0], selector); w[31] = __byte_perm_S (w[ 0], 0, selector); w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 32: w[63] = __byte_perm_S (w[31], w[30], selector); w[62] = 
__byte_perm_S (w[30], w[29], selector); w[61] = __byte_perm_S (w[29], w[28], selector); w[60] = __byte_perm_S (w[28], w[27], selector); w[59] = __byte_perm_S (w[27], w[26], selector); w[58] = __byte_perm_S (w[26], w[25], selector); w[57] = __byte_perm_S (w[25], w[24], selector); w[56] = __byte_perm_S (w[24], w[23], selector); w[55] = __byte_perm_S (w[23], w[22], selector); w[54] = __byte_perm_S (w[22], w[21], selector); w[53] = __byte_perm_S (w[21], w[20], selector); w[52] = __byte_perm_S (w[20], w[19], selector); w[51] = __byte_perm_S (w[19], w[18], selector); w[50] = __byte_perm_S (w[18], w[17], selector); w[49] = __byte_perm_S (w[17], w[16], selector); w[48] = __byte_perm_S (w[16], w[15], selector); w[47] = __byte_perm_S (w[15], w[14], selector); w[46] = __byte_perm_S (w[14], w[13], selector); w[45] = __byte_perm_S (w[13], w[12], selector); w[44] = __byte_perm_S (w[12], w[11], selector); w[43] = __byte_perm_S (w[11], w[10], selector); w[42] = __byte_perm_S (w[10], w[ 9], selector); w[41] = __byte_perm_S (w[ 9], w[ 8], selector); w[40] = __byte_perm_S (w[ 8], w[ 7], selector); w[39] = __byte_perm_S (w[ 7], w[ 6], selector); w[38] = __byte_perm_S (w[ 6], w[ 5], selector); w[37] = __byte_perm_S (w[ 5], w[ 4], selector); w[36] = __byte_perm_S (w[ 4], w[ 3], selector); w[35] = __byte_perm_S (w[ 3], w[ 2], selector); w[34] = __byte_perm_S (w[ 2], w[ 1], selector); w[33] = __byte_perm_S (w[ 1], w[ 0], selector); w[32] = __byte_perm_S (w[ 0], 0, selector); w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 33: w[63] = __byte_perm_S (w[30], w[29], selector); w[62] = __byte_perm_S (w[29], w[28], selector); w[61] = __byte_perm_S (w[28], w[27], selector); w[60] = 
__byte_perm_S (w[27], w[26], selector); w[59] = __byte_perm_S (w[26], w[25], selector); w[58] = __byte_perm_S (w[25], w[24], selector); w[57] = __byte_perm_S (w[24], w[23], selector); w[56] = __byte_perm_S (w[23], w[22], selector); w[55] = __byte_perm_S (w[22], w[21], selector); w[54] = __byte_perm_S (w[21], w[20], selector); w[53] = __byte_perm_S (w[20], w[19], selector); w[52] = __byte_perm_S (w[19], w[18], selector); w[51] = __byte_perm_S (w[18], w[17], selector); w[50] = __byte_perm_S (w[17], w[16], selector); w[49] = __byte_perm_S (w[16], w[15], selector); w[48] = __byte_perm_S (w[15], w[14], selector); w[47] = __byte_perm_S (w[14], w[13], selector); w[46] = __byte_perm_S (w[13], w[12], selector); w[45] = __byte_perm_S (w[12], w[11], selector); w[44] = __byte_perm_S (w[11], w[10], selector); w[43] = __byte_perm_S (w[10], w[ 9], selector); w[42] = __byte_perm_S (w[ 9], w[ 8], selector); w[41] = __byte_perm_S (w[ 8], w[ 7], selector); w[40] = __byte_perm_S (w[ 7], w[ 6], selector); w[39] = __byte_perm_S (w[ 6], w[ 5], selector); w[38] = __byte_perm_S (w[ 5], w[ 4], selector); w[37] = __byte_perm_S (w[ 4], w[ 3], selector); w[36] = __byte_perm_S (w[ 3], w[ 2], selector); w[35] = __byte_perm_S (w[ 2], w[ 1], selector); w[34] = __byte_perm_S (w[ 1], w[ 0], selector); w[33] = __byte_perm_S (w[ 0], 0, selector); w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 34: w[63] = __byte_perm_S (w[29], w[28], selector); w[62] = __byte_perm_S (w[28], w[27], selector); w[61] = __byte_perm_S (w[27], w[26], selector); w[60] = __byte_perm_S (w[26], w[25], selector); w[59] = __byte_perm_S (w[25], w[24], selector); w[58] = __byte_perm_S (w[24], w[23], selector); 
w[57] = __byte_perm_S (w[23], w[22], selector); w[56] = __byte_perm_S (w[22], w[21], selector); w[55] = __byte_perm_S (w[21], w[20], selector); w[54] = __byte_perm_S (w[20], w[19], selector); w[53] = __byte_perm_S (w[19], w[18], selector); w[52] = __byte_perm_S (w[18], w[17], selector); w[51] = __byte_perm_S (w[17], w[16], selector); w[50] = __byte_perm_S (w[16], w[15], selector); w[49] = __byte_perm_S (w[15], w[14], selector); w[48] = __byte_perm_S (w[14], w[13], selector); w[47] = __byte_perm_S (w[13], w[12], selector); w[46] = __byte_perm_S (w[12], w[11], selector); w[45] = __byte_perm_S (w[11], w[10], selector); w[44] = __byte_perm_S (w[10], w[ 9], selector); w[43] = __byte_perm_S (w[ 9], w[ 8], selector); w[42] = __byte_perm_S (w[ 8], w[ 7], selector); w[41] = __byte_perm_S (w[ 7], w[ 6], selector); w[40] = __byte_perm_S (w[ 6], w[ 5], selector); w[39] = __byte_perm_S (w[ 5], w[ 4], selector); w[38] = __byte_perm_S (w[ 4], w[ 3], selector); w[37] = __byte_perm_S (w[ 3], w[ 2], selector); w[36] = __byte_perm_S (w[ 2], w[ 1], selector); w[35] = __byte_perm_S (w[ 1], w[ 0], selector); w[34] = __byte_perm_S (w[ 0], 0, selector); w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 35: w[63] = __byte_perm_S (w[28], w[27], selector); w[62] = __byte_perm_S (w[27], w[26], selector); w[61] = __byte_perm_S (w[26], w[25], selector); w[60] = __byte_perm_S (w[25], w[24], selector); w[59] = __byte_perm_S (w[24], w[23], selector); w[58] = __byte_perm_S (w[23], w[22], selector); w[57] = __byte_perm_S (w[22], w[21], selector); w[56] = __byte_perm_S (w[21], w[20], selector); w[55] = __byte_perm_S (w[20], w[19], selector); w[54] = __byte_perm_S (w[19], 
w[18], selector); w[53] = __byte_perm_S (w[18], w[17], selector); w[52] = __byte_perm_S (w[17], w[16], selector); w[51] = __byte_perm_S (w[16], w[15], selector); w[50] = __byte_perm_S (w[15], w[14], selector); w[49] = __byte_perm_S (w[14], w[13], selector); w[48] = __byte_perm_S (w[13], w[12], selector); w[47] = __byte_perm_S (w[12], w[11], selector); w[46] = __byte_perm_S (w[11], w[10], selector); w[45] = __byte_perm_S (w[10], w[ 9], selector); w[44] = __byte_perm_S (w[ 9], w[ 8], selector); w[43] = __byte_perm_S (w[ 8], w[ 7], selector); w[42] = __byte_perm_S (w[ 7], w[ 6], selector); w[41] = __byte_perm_S (w[ 6], w[ 5], selector); w[40] = __byte_perm_S (w[ 5], w[ 4], selector); w[39] = __byte_perm_S (w[ 4], w[ 3], selector); w[38] = __byte_perm_S (w[ 3], w[ 2], selector); w[37] = __byte_perm_S (w[ 2], w[ 1], selector); w[36] = __byte_perm_S (w[ 1], w[ 0], selector); w[35] = __byte_perm_S (w[ 0], 0, selector); w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 36: w[63] = __byte_perm_S (w[27], w[26], selector); w[62] = __byte_perm_S (w[26], w[25], selector); w[61] = __byte_perm_S (w[25], w[24], selector); w[60] = __byte_perm_S (w[24], w[23], selector); w[59] = __byte_perm_S (w[23], w[22], selector); w[58] = __byte_perm_S (w[22], w[21], selector); w[57] = __byte_perm_S (w[21], w[20], selector); w[56] = __byte_perm_S (w[20], w[19], selector); w[55] = __byte_perm_S (w[19], w[18], selector); w[54] = __byte_perm_S (w[18], w[17], selector); w[53] = __byte_perm_S (w[17], w[16], selector); w[52] = __byte_perm_S (w[16], w[15], selector); w[51] = __byte_perm_S (w[15], w[14], selector); w[50] = __byte_perm_S (w[14], w[13], selector); 
w[49] = __byte_perm_S (w[13], w[12], selector); w[48] = __byte_perm_S (w[12], w[11], selector); w[47] = __byte_perm_S (w[11], w[10], selector); w[46] = __byte_perm_S (w[10], w[ 9], selector); w[45] = __byte_perm_S (w[ 9], w[ 8], selector); w[44] = __byte_perm_S (w[ 8], w[ 7], selector); w[43] = __byte_perm_S (w[ 7], w[ 6], selector); w[42] = __byte_perm_S (w[ 6], w[ 5], selector); w[41] = __byte_perm_S (w[ 5], w[ 4], selector); w[40] = __byte_perm_S (w[ 4], w[ 3], selector); w[39] = __byte_perm_S (w[ 3], w[ 2], selector); w[38] = __byte_perm_S (w[ 2], w[ 1], selector); w[37] = __byte_perm_S (w[ 1], w[ 0], selector); w[36] = __byte_perm_S (w[ 0], 0, selector); w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 37: w[63] = __byte_perm_S (w[26], w[25], selector); w[62] = __byte_perm_S (w[25], w[24], selector); w[61] = __byte_perm_S (w[24], w[23], selector); w[60] = __byte_perm_S (w[23], w[22], selector); w[59] = __byte_perm_S (w[22], w[21], selector); w[58] = __byte_perm_S (w[21], w[20], selector); w[57] = __byte_perm_S (w[20], w[19], selector); w[56] = __byte_perm_S (w[19], w[18], selector); w[55] = __byte_perm_S (w[18], w[17], selector); w[54] = __byte_perm_S (w[17], w[16], selector); w[53] = __byte_perm_S (w[16], w[15], selector); w[52] = __byte_perm_S (w[15], w[14], selector); w[51] = __byte_perm_S (w[14], w[13], selector); w[50] = __byte_perm_S (w[13], w[12], selector); w[49] = __byte_perm_S (w[12], w[11], selector); w[48] = __byte_perm_S (w[11], w[10], selector); w[47] = __byte_perm_S (w[10], w[ 9], selector); w[46] = __byte_perm_S (w[ 9], w[ 8], selector); w[45] = __byte_perm_S (w[ 8], w[ 7], selector); w[44] = 
__byte_perm_S (w[ 7], w[ 6], selector); w[43] = __byte_perm_S (w[ 6], w[ 5], selector); w[42] = __byte_perm_S (w[ 5], w[ 4], selector); w[41] = __byte_perm_S (w[ 4], w[ 3], selector); w[40] = __byte_perm_S (w[ 3], w[ 2], selector); w[39] = __byte_perm_S (w[ 2], w[ 1], selector); w[38] = __byte_perm_S (w[ 1], w[ 0], selector); w[37] = __byte_perm_S (w[ 0], 0, selector); w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 38: w[63] = __byte_perm_S (w[25], w[24], selector); w[62] = __byte_perm_S (w[24], w[23], selector); w[61] = __byte_perm_S (w[23], w[22], selector); w[60] = __byte_perm_S (w[22], w[21], selector); w[59] = __byte_perm_S (w[21], w[20], selector); w[58] = __byte_perm_S (w[20], w[19], selector); w[57] = __byte_perm_S (w[19], w[18], selector); w[56] = __byte_perm_S (w[18], w[17], selector); w[55] = __byte_perm_S (w[17], w[16], selector); w[54] = __byte_perm_S (w[16], w[15], selector); w[53] = __byte_perm_S (w[15], w[14], selector); w[52] = __byte_perm_S (w[14], w[13], selector); w[51] = __byte_perm_S (w[13], w[12], selector); w[50] = __byte_perm_S (w[12], w[11], selector); w[49] = __byte_perm_S (w[11], w[10], selector); w[48] = __byte_perm_S (w[10], w[ 9], selector); w[47] = __byte_perm_S (w[ 9], w[ 8], selector); w[46] = __byte_perm_S (w[ 8], w[ 7], selector); w[45] = __byte_perm_S (w[ 7], w[ 6], selector); w[44] = __byte_perm_S (w[ 6], w[ 5], selector); w[43] = __byte_perm_S (w[ 5], w[ 4], selector); w[42] = __byte_perm_S (w[ 4], w[ 3], selector); w[41] = __byte_perm_S (w[ 3], w[ 2], selector); w[40] = __byte_perm_S (w[ 2], w[ 1], selector); w[39] = __byte_perm_S (w[ 1], w[ 0], selector); 
w[38] = __byte_perm_S (w[ 0], 0, selector); w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 39: w[63] = __byte_perm_S (w[24], w[23], selector); w[62] = __byte_perm_S (w[23], w[22], selector); w[61] = __byte_perm_S (w[22], w[21], selector); w[60] = __byte_perm_S (w[21], w[20], selector); w[59] = __byte_perm_S (w[20], w[19], selector); w[58] = __byte_perm_S (w[19], w[18], selector); w[57] = __byte_perm_S (w[18], w[17], selector); w[56] = __byte_perm_S (w[17], w[16], selector); w[55] = __byte_perm_S (w[16], w[15], selector); w[54] = __byte_perm_S (w[15], w[14], selector); w[53] = __byte_perm_S (w[14], w[13], selector); w[52] = __byte_perm_S (w[13], w[12], selector); w[51] = __byte_perm_S (w[12], w[11], selector); w[50] = __byte_perm_S (w[11], w[10], selector); w[49] = __byte_perm_S (w[10], w[ 9], selector); w[48] = __byte_perm_S (w[ 9], w[ 8], selector); w[47] = __byte_perm_S (w[ 8], w[ 7], selector); w[46] = __byte_perm_S (w[ 7], w[ 6], selector); w[45] = __byte_perm_S (w[ 6], w[ 5], selector); w[44] = __byte_perm_S (w[ 5], w[ 4], selector); w[43] = __byte_perm_S (w[ 4], w[ 3], selector); w[42] = __byte_perm_S (w[ 3], w[ 2], selector); w[41] = __byte_perm_S (w[ 2], w[ 1], selector); w[40] = __byte_perm_S (w[ 1], w[ 0], selector); w[39] = __byte_perm_S (w[ 0], 0, selector); w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] 
= 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 40: w[63] = __byte_perm_S (w[23], w[22], selector); w[62] = __byte_perm_S (w[22], w[21], selector); w[61] = __byte_perm_S (w[21], w[20], selector); w[60] = __byte_perm_S (w[20], w[19], selector); w[59] = __byte_perm_S (w[19], w[18], selector); w[58] = __byte_perm_S (w[18], w[17], selector); w[57] = __byte_perm_S (w[17], w[16], selector); w[56] = __byte_perm_S (w[16], w[15], selector); w[55] = __byte_perm_S (w[15], w[14], selector); w[54] = __byte_perm_S (w[14], w[13], selector); w[53] = __byte_perm_S (w[13], w[12], selector); w[52] = __byte_perm_S (w[12], w[11], selector); w[51] = __byte_perm_S (w[11], w[10], selector); w[50] = __byte_perm_S (w[10], w[ 9], selector); w[49] = __byte_perm_S (w[ 9], w[ 8], selector); w[48] = __byte_perm_S (w[ 8], w[ 7], selector); w[47] = __byte_perm_S (w[ 7], w[ 6], selector); w[46] = __byte_perm_S (w[ 6], w[ 5], selector); w[45] = __byte_perm_S (w[ 5], w[ 4], selector); w[44] = __byte_perm_S (w[ 4], w[ 3], selector); w[43] = __byte_perm_S (w[ 3], w[ 2], selector); w[42] = __byte_perm_S (w[ 2], w[ 1], selector); w[41] = __byte_perm_S (w[ 1], w[ 0], selector); w[40] = __byte_perm_S (w[ 0], 0, selector); w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 41: w[63] = __byte_perm_S (w[22], w[21], selector); w[62] = __byte_perm_S (w[21], w[20], selector); w[61] = __byte_perm_S (w[20], w[19], selector); w[60] = __byte_perm_S (w[19], w[18], selector); w[59] = __byte_perm_S (w[18], w[17], selector); w[58] = __byte_perm_S (w[17], 
w[16], selector); w[57] = __byte_perm_S (w[16], w[15], selector); w[56] = __byte_perm_S (w[15], w[14], selector); w[55] = __byte_perm_S (w[14], w[13], selector); w[54] = __byte_perm_S (w[13], w[12], selector); w[53] = __byte_perm_S (w[12], w[11], selector); w[52] = __byte_perm_S (w[11], w[10], selector); w[51] = __byte_perm_S (w[10], w[ 9], selector); w[50] = __byte_perm_S (w[ 9], w[ 8], selector); w[49] = __byte_perm_S (w[ 8], w[ 7], selector); w[48] = __byte_perm_S (w[ 7], w[ 6], selector); w[47] = __byte_perm_S (w[ 6], w[ 5], selector); w[46] = __byte_perm_S (w[ 5], w[ 4], selector); w[45] = __byte_perm_S (w[ 4], w[ 3], selector); w[44] = __byte_perm_S (w[ 3], w[ 2], selector); w[43] = __byte_perm_S (w[ 2], w[ 1], selector); w[42] = __byte_perm_S (w[ 1], w[ 0], selector); w[41] = __byte_perm_S (w[ 0], 0, selector); w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 42: w[63] = __byte_perm_S (w[21], w[20], selector); w[62] = __byte_perm_S (w[20], w[19], selector); w[61] = __byte_perm_S (w[19], w[18], selector); w[60] = __byte_perm_S (w[18], w[17], selector); w[59] = __byte_perm_S (w[17], w[16], selector); w[58] = __byte_perm_S (w[16], w[15], selector); w[57] = __byte_perm_S (w[15], w[14], selector); w[56] = __byte_perm_S (w[14], w[13], selector); w[55] = __byte_perm_S (w[13], w[12], selector); w[54] = __byte_perm_S (w[12], w[11], selector); w[53] = __byte_perm_S (w[11], w[10], selector); w[52] = __byte_perm_S (w[10], w[ 9], selector); w[51] = __byte_perm_S (w[ 9], w[ 8], selector); w[50] = __byte_perm_S (w[ 8], w[ 7], selector); w[49] = __byte_perm_S (w[ 7], 
w[ 6], selector); w[48] = __byte_perm_S (w[ 6], w[ 5], selector); w[47] = __byte_perm_S (w[ 5], w[ 4], selector); w[46] = __byte_perm_S (w[ 4], w[ 3], selector); w[45] = __byte_perm_S (w[ 3], w[ 2], selector); w[44] = __byte_perm_S (w[ 2], w[ 1], selector); w[43] = __byte_perm_S (w[ 1], w[ 0], selector); w[42] = __byte_perm_S (w[ 0], 0, selector); w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 43: w[63] = __byte_perm_S (w[20], w[19], selector); w[62] = __byte_perm_S (w[19], w[18], selector); w[61] = __byte_perm_S (w[18], w[17], selector); w[60] = __byte_perm_S (w[17], w[16], selector); w[59] = __byte_perm_S (w[16], w[15], selector); w[58] = __byte_perm_S (w[15], w[14], selector); w[57] = __byte_perm_S (w[14], w[13], selector); w[56] = __byte_perm_S (w[13], w[12], selector); w[55] = __byte_perm_S (w[12], w[11], selector); w[54] = __byte_perm_S (w[11], w[10], selector); w[53] = __byte_perm_S (w[10], w[ 9], selector); w[52] = __byte_perm_S (w[ 9], w[ 8], selector); w[51] = __byte_perm_S (w[ 8], w[ 7], selector); w[50] = __byte_perm_S (w[ 7], w[ 6], selector); w[49] = __byte_perm_S (w[ 6], w[ 5], selector); w[48] = __byte_perm_S (w[ 5], w[ 4], selector); w[47] = __byte_perm_S (w[ 4], w[ 3], selector); w[46] = __byte_perm_S (w[ 3], w[ 2], selector); w[45] = __byte_perm_S (w[ 2], w[ 1], selector); w[44] = __byte_perm_S (w[ 1], w[ 0], selector); w[43] = __byte_perm_S (w[ 0], 0, selector); w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; 
w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 44: w[63] = __byte_perm_S (w[19], w[18], selector); w[62] = __byte_perm_S (w[18], w[17], selector); w[61] = __byte_perm_S (w[17], w[16], selector); w[60] = __byte_perm_S (w[16], w[15], selector); w[59] = __byte_perm_S (w[15], w[14], selector); w[58] = __byte_perm_S (w[14], w[13], selector); w[57] = __byte_perm_S (w[13], w[12], selector); w[56] = __byte_perm_S (w[12], w[11], selector); w[55] = __byte_perm_S (w[11], w[10], selector); w[54] = __byte_perm_S (w[10], w[ 9], selector); w[53] = __byte_perm_S (w[ 9], w[ 8], selector); w[52] = __byte_perm_S (w[ 8], w[ 7], selector); w[51] = __byte_perm_S (w[ 7], w[ 6], selector); w[50] = __byte_perm_S (w[ 6], w[ 5], selector); w[49] = __byte_perm_S (w[ 5], w[ 4], selector); w[48] = __byte_perm_S (w[ 4], w[ 3], selector); w[47] = __byte_perm_S (w[ 3], w[ 2], selector); w[46] = __byte_perm_S (w[ 2], w[ 1], selector); w[45] = __byte_perm_S (w[ 1], w[ 0], selector); w[44] = __byte_perm_S (w[ 0], 0, selector); w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 45: w[63] = __byte_perm_S (w[18], w[17], selector); w[62] = __byte_perm_S (w[17], w[16], selector); w[61] = __byte_perm_S (w[16], w[15], selector); w[60] = __byte_perm_S (w[15], w[14], selector); w[59] = __byte_perm_S 
(w[14], w[13], selector); w[58] = __byte_perm_S (w[13], w[12], selector); w[57] = __byte_perm_S (w[12], w[11], selector); w[56] = __byte_perm_S (w[11], w[10], selector); w[55] = __byte_perm_S (w[10], w[ 9], selector); w[54] = __byte_perm_S (w[ 9], w[ 8], selector); w[53] = __byte_perm_S (w[ 8], w[ 7], selector); w[52] = __byte_perm_S (w[ 7], w[ 6], selector); w[51] = __byte_perm_S (w[ 6], w[ 5], selector); w[50] = __byte_perm_S (w[ 5], w[ 4], selector); w[49] = __byte_perm_S (w[ 4], w[ 3], selector); w[48] = __byte_perm_S (w[ 3], w[ 2], selector); w[47] = __byte_perm_S (w[ 2], w[ 1], selector); w[46] = __byte_perm_S (w[ 1], w[ 0], selector); w[45] = __byte_perm_S (w[ 0], 0, selector); w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 46: w[63] = __byte_perm_S (w[17], w[16], selector); w[62] = __byte_perm_S (w[16], w[15], selector); w[61] = __byte_perm_S (w[15], w[14], selector); w[60] = __byte_perm_S (w[14], w[13], selector); w[59] = __byte_perm_S (w[13], w[12], selector); w[58] = __byte_perm_S (w[12], w[11], selector); w[57] = __byte_perm_S (w[11], w[10], selector); w[56] = __byte_perm_S (w[10], w[ 9], selector); w[55] = __byte_perm_S (w[ 9], w[ 8], selector); w[54] = __byte_perm_S (w[ 8], w[ 7], selector); w[53] = __byte_perm_S (w[ 7], w[ 6], selector); w[52] = __byte_perm_S (w[ 6], w[ 5], selector); w[51] = __byte_perm_S (w[ 5], w[ 4], selector); w[50] = __byte_perm_S (w[ 4], w[ 3], selector); w[49] = __byte_perm_S (w[ 3], w[ 2], selector); w[48] = __byte_perm_S (w[ 2], w[ 1], selector); w[47] = __byte_perm_S (w[ 
1], w[ 0], selector); w[46] = __byte_perm_S (w[ 0], 0, selector); w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 47: w[63] = __byte_perm_S (w[16], w[15], selector); w[62] = __byte_perm_S (w[15], w[14], selector); w[61] = __byte_perm_S (w[14], w[13], selector); w[60] = __byte_perm_S (w[13], w[12], selector); w[59] = __byte_perm_S (w[12], w[11], selector); w[58] = __byte_perm_S (w[11], w[10], selector); w[57] = __byte_perm_S (w[10], w[ 9], selector); w[56] = __byte_perm_S (w[ 9], w[ 8], selector); w[55] = __byte_perm_S (w[ 8], w[ 7], selector); w[54] = __byte_perm_S (w[ 7], w[ 6], selector); w[53] = __byte_perm_S (w[ 6], w[ 5], selector); w[52] = __byte_perm_S (w[ 5], w[ 4], selector); w[51] = __byte_perm_S (w[ 4], w[ 3], selector); w[50] = __byte_perm_S (w[ 3], w[ 2], selector); w[49] = __byte_perm_S (w[ 2], w[ 1], selector); w[48] = __byte_perm_S (w[ 1], w[ 0], selector); w[47] = __byte_perm_S (w[ 0], 0, selector); w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 48: w[63] = __byte_perm_S (w[15], w[14], selector); w[62] = 
__byte_perm_S (w[14], w[13], selector); w[61] = __byte_perm_S (w[13], w[12], selector); w[60] = __byte_perm_S (w[12], w[11], selector); w[59] = __byte_perm_S (w[11], w[10], selector); w[58] = __byte_perm_S (w[10], w[ 9], selector); w[57] = __byte_perm_S (w[ 9], w[ 8], selector); w[56] = __byte_perm_S (w[ 8], w[ 7], selector); w[55] = __byte_perm_S (w[ 7], w[ 6], selector); w[54] = __byte_perm_S (w[ 6], w[ 5], selector); w[53] = __byte_perm_S (w[ 5], w[ 4], selector); w[52] = __byte_perm_S (w[ 4], w[ 3], selector); w[51] = __byte_perm_S (w[ 3], w[ 2], selector); w[50] = __byte_perm_S (w[ 2], w[ 1], selector); w[49] = __byte_perm_S (w[ 1], w[ 0], selector); w[48] = __byte_perm_S (w[ 0], 0, selector); w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 49: w[63] = __byte_perm_S (w[14], w[13], selector); w[62] = __byte_perm_S (w[13], w[12], selector); w[61] = __byte_perm_S (w[12], w[11], selector); w[60] = __byte_perm_S (w[11], w[10], selector); w[59] = __byte_perm_S (w[10], w[ 9], selector); w[58] = __byte_perm_S (w[ 9], w[ 8], selector); w[57] = __byte_perm_S (w[ 8], w[ 7], selector); w[56] = __byte_perm_S (w[ 7], w[ 6], selector); w[55] = __byte_perm_S (w[ 6], w[ 5], selector); w[54] = __byte_perm_S (w[ 5], w[ 4], selector); w[53] = __byte_perm_S (w[ 4], w[ 3], selector); w[52] = __byte_perm_S (w[ 3], w[ 2], selector); w[51] = __byte_perm_S (w[ 2], w[ 1], selector); w[50] = __byte_perm_S (w[ 1], w[ 0], selector); w[49] = __byte_perm_S (w[ 0], 0, selector); w[48] = 0; w[47] = 0; w[46] = 
0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 50: w[63] = __byte_perm_S (w[13], w[12], selector); w[62] = __byte_perm_S (w[12], w[11], selector); w[61] = __byte_perm_S (w[11], w[10], selector); w[60] = __byte_perm_S (w[10], w[ 9], selector); w[59] = __byte_perm_S (w[ 9], w[ 8], selector); w[58] = __byte_perm_S (w[ 8], w[ 7], selector); w[57] = __byte_perm_S (w[ 7], w[ 6], selector); w[56] = __byte_perm_S (w[ 6], w[ 5], selector); w[55] = __byte_perm_S (w[ 5], w[ 4], selector); w[54] = __byte_perm_S (w[ 4], w[ 3], selector); w[53] = __byte_perm_S (w[ 3], w[ 2], selector); w[52] = __byte_perm_S (w[ 2], w[ 1], selector); w[51] = __byte_perm_S (w[ 1], w[ 0], selector); w[50] = __byte_perm_S (w[ 0], 0, selector); w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 51: w[63] = __byte_perm_S (w[12], w[11], selector); w[62] = __byte_perm_S (w[11], w[10], selector); w[61] = __byte_perm_S (w[10], w[ 9], selector); w[60] = __byte_perm_S (w[ 9], w[ 8], selector); w[59] = __byte_perm_S (w[ 8], w[ 7], selector); 
w[58] = __byte_perm_S (w[ 7], w[ 6], selector); w[57] = __byte_perm_S (w[ 6], w[ 5], selector); w[56] = __byte_perm_S (w[ 5], w[ 4], selector); w[55] = __byte_perm_S (w[ 4], w[ 3], selector); w[54] = __byte_perm_S (w[ 3], w[ 2], selector); w[53] = __byte_perm_S (w[ 2], w[ 1], selector); w[52] = __byte_perm_S (w[ 1], w[ 0], selector); w[51] = __byte_perm_S (w[ 0], 0, selector); w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 52: w[63] = __byte_perm_S (w[11], w[10], selector); w[62] = __byte_perm_S (w[10], w[ 9], selector); w[61] = __byte_perm_S (w[ 9], w[ 8], selector); w[60] = __byte_perm_S (w[ 8], w[ 7], selector); w[59] = __byte_perm_S (w[ 7], w[ 6], selector); w[58] = __byte_perm_S (w[ 6], w[ 5], selector); w[57] = __byte_perm_S (w[ 5], w[ 4], selector); w[56] = __byte_perm_S (w[ 4], w[ 3], selector); w[55] = __byte_perm_S (w[ 3], w[ 2], selector); w[54] = __byte_perm_S (w[ 2], w[ 1], selector); w[53] = __byte_perm_S (w[ 1], w[ 0], selector); w[52] = __byte_perm_S (w[ 0], 0, selector); w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 
0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 53: w[63] = __byte_perm_S (w[10], w[ 9], selector); w[62] = __byte_perm_S (w[ 9], w[ 8], selector); w[61] = __byte_perm_S (w[ 8], w[ 7], selector); w[60] = __byte_perm_S (w[ 7], w[ 6], selector); w[59] = __byte_perm_S (w[ 6], w[ 5], selector); w[58] = __byte_perm_S (w[ 5], w[ 4], selector); w[57] = __byte_perm_S (w[ 4], w[ 3], selector); w[56] = __byte_perm_S (w[ 3], w[ 2], selector); w[55] = __byte_perm_S (w[ 2], w[ 1], selector); w[54] = __byte_perm_S (w[ 1], w[ 0], selector); w[53] = __byte_perm_S (w[ 0], 0, selector); w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 54: w[63] = __byte_perm_S (w[ 9], w[ 8], selector); w[62] = __byte_perm_S (w[ 8], w[ 7], selector); w[61] = __byte_perm_S (w[ 7], w[ 6], selector); w[60] = __byte_perm_S (w[ 6], w[ 5], selector); w[59] = __byte_perm_S (w[ 5], w[ 4], selector); w[58] = __byte_perm_S (w[ 4], w[ 3], selector); w[57] = __byte_perm_S (w[ 3], w[ 2], selector); w[56] = __byte_perm_S (w[ 2], w[ 1], selector); w[55] = __byte_perm_S (w[ 1], w[ 0], selector); w[54] = __byte_perm_S (w[ 0], 0, selector); w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 
0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 55: w[63] = __byte_perm_S (w[ 8], w[ 7], selector); w[62] = __byte_perm_S (w[ 7], w[ 6], selector); w[61] = __byte_perm_S (w[ 6], w[ 5], selector); w[60] = __byte_perm_S (w[ 5], w[ 4], selector); w[59] = __byte_perm_S (w[ 4], w[ 3], selector); w[58] = __byte_perm_S (w[ 3], w[ 2], selector); w[57] = __byte_perm_S (w[ 2], w[ 1], selector); w[56] = __byte_perm_S (w[ 1], w[ 0], selector); w[55] = __byte_perm_S (w[ 0], 0, selector); w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 56: w[63] = __byte_perm_S (w[ 7], w[ 6], selector); w[62] = __byte_perm_S (w[ 6], w[ 5], selector); w[61] = __byte_perm_S (w[ 5], w[ 4], selector); w[60] = __byte_perm_S (w[ 4], w[ 3], selector); w[59] = __byte_perm_S (w[ 3], w[ 2], selector); w[58] = __byte_perm_S (w[ 2], w[ 1], selector); w[57] = __byte_perm_S (w[ 1], w[ 0], selector); w[56] = __byte_perm_S (w[ 0], 0, selector); w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; 
w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 57: w[63] = __byte_perm_S (w[ 6], w[ 5], selector); w[62] = __byte_perm_S (w[ 5], w[ 4], selector); w[61] = __byte_perm_S (w[ 4], w[ 3], selector); w[60] = __byte_perm_S (w[ 3], w[ 2], selector); w[59] = __byte_perm_S (w[ 2], w[ 1], selector); w[58] = __byte_perm_S (w[ 1], w[ 0], selector); w[57] = __byte_perm_S (w[ 0], 0, selector); w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 58: w[63] = __byte_perm_S (w[ 5], w[ 4], selector); w[62] = __byte_perm_S (w[ 4], w[ 3], selector); w[61] = __byte_perm_S (w[ 3], w[ 2], selector); w[60] = __byte_perm_S (w[ 2], w[ 1], selector); w[59] = __byte_perm_S (w[ 1], w[ 0], selector); w[58] = __byte_perm_S (w[ 0], 0, selector); w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 
0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 59: w[63] = __byte_perm_S (w[ 4], w[ 3], selector); w[62] = __byte_perm_S (w[ 3], w[ 2], selector); w[61] = __byte_perm_S (w[ 2], w[ 1], selector); w[60] = __byte_perm_S (w[ 1], w[ 0], selector); w[59] = __byte_perm_S (w[ 0], 0, selector); w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 60: w[63] = __byte_perm_S (w[ 3], w[ 2], selector); w[62] = __byte_perm_S (w[ 2], w[ 1], selector); w[61] = __byte_perm_S (w[ 1], w[ 0], selector); w[60] = __byte_perm_S (w[ 0], 0, selector); w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; 
w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 61: w[63] = __byte_perm_S (w[ 2], w[ 1], selector); w[62] = __byte_perm_S (w[ 1], w[ 0], selector); w[61] = __byte_perm_S (w[ 0], 0, selector); w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 62: w[63] = __byte_perm_S (w[ 1], w[ 0], selector); w[62] = __byte_perm_S (w[ 0], 0, selector); w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; case 63: w[63] = __byte_perm_S (w[ 0], 0, selector); w[62] = 0; w[61] = 0; w[60] = 0; w[59] = 0; w[58] = 0; w[57] = 0; w[56] = 0; w[55] = 0; w[54] = 0; w[53] = 0; w[52] = 0; w[51] = 0; w[50] = 0; w[49] = 0; w[48] = 0; w[47] = 0; w[46] = 0; w[45] = 0; w[44] = 0; w[43] = 0; w[42] = 0; w[41] = 0; w[40] = 0; w[39] = 0; 
w[38] = 0; w[37] = 0; w[36] = 0; w[35] = 0; w[34] = 0; w[33] = 0; w[32] = 0; w[31] = 0; w[30] = 0; w[29] = 0; w[28] = 0; w[27] = 0; w[26] = 0; w[25] = 0; w[24] = 0; w[23] = 0; w[22] = 0; w[21] = 0; w[20] = 0; w[19] = 0; w[18] = 0; w[17] = 0; w[16] = 0; w[15] = 0; w[14] = 0; w[13] = 0; w[12] = 0; w[11] = 0; w[10] = 0; w[ 9] = 0; w[ 8] = 0; w[ 7] = 0; w[ 6] = 0; w[ 5] = 0; w[ 4] = 0; w[ 3] = 0; w[ 2] = 0; w[ 1] = 0; w[ 0] = 0; break; } #endif } /**
 * vector functions on scalar types (for inner loop usage)
 *
 * Lane pack/unpack helper macros. `e` is token-pasted onto `.s` (`.s##e`),
 * so it must be a bare component suffix such as 0, 1, ... 9, a, b.
 *
 * PACKVS2 / PACKVS4: for i in 0..1 (resp. 0..3) assign sn[i] = vn[i].s<e>,
 *   i.e. copy component <e> of each element of vn into sn.
 * PACKSV2 / PACKSV4: the reverse assignment, vn[i].s<e> = sn[i].
 * PACKVS24 / PACKVS44 / PACKVS84: apply PACKVS4 to 2 / 4 / 8 (sN, vN) pairs.
 * PACKSV24 / PACKSV44 / PACKSV84: apply PACKSV4 to 2 / 4 / 8 (sN, vN) pairs.
 *
 * NOTE: every macro expands to several bare statements (no do/while wrapper)
 * and already ends in ';', so invoke it only as a standalone statement, never
 * as the body of an unbraced if/else/for.
 * NOTE: PACKVS24 / PACKSV24 name PACKVS4 / PACKSV4 before those are defined;
 * that is fine because macro bodies are only expanded where they are used.
 */ #define PACKVS2(sn,vn,e) \ sn[0] = vn[0].s##e; \ sn[1] = vn[1].s##e; #define PACKSV2(sn,vn,e) \ vn[0].s##e = sn[0]; \ vn[1].s##e = sn[1]; #define PACKVS24(s0,s1,v0,v1,e) \ PACKVS4 (s0, v0, e); \ PACKVS4 (s1, v1, e); #define PACKSV24(s0,s1,v0,v1,e) \ PACKSV4 (s0, v0, e); \ PACKSV4 (s1, v1, e); #define PACKVS4(sn,vn,e) \ sn[0] = vn[0].s##e; \ sn[1] = vn[1].s##e; \ sn[2] = vn[2].s##e; \ sn[3] = vn[3].s##e; #define PACKSV4(sn,vn,e) \ vn[0].s##e = sn[0]; \ vn[1].s##e = sn[1]; \ vn[2].s##e = sn[2]; \ vn[3].s##e = sn[3]; #define PACKVS44(s0,s1,s2,s3,v0,v1,v2,v3,e) \ PACKVS4 (s0, v0, e); \ PACKVS4 (s1, v1, e); \ PACKVS4 (s2, v2, e); \ PACKVS4 (s3, v3, e); #define PACKSV44(s0,s1,s2,s3,v0,v1,v2,v3,e) \ PACKSV4 (s0, v0, e); \ PACKSV4 (s1, v1, e); \ PACKSV4 (s2, v2, e); \ PACKSV4 (s3, v3, e); #define PACKVS84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ PACKVS4 (s0, v0, e); \ PACKVS4 (s1, v1, e); \ PACKVS4 (s2, v2, e); \ PACKVS4 (s3, v3, e); \ PACKVS4 (s4, v4, e); \ PACKVS4 (s5, v5, e); \ PACKVS4 (s6, v6, e); \ PACKVS4 (s7, v7, e); #define PACKSV84(s0,s1,s2,s3,s4,s5,s6,s7,v0,v1,v2,v3,v4,v5,v6,v7,e) \ PACKSV4 (s0, v0, e); \ PACKSV4 (s1, v1, e); \ PACKSV4 (s2, v2, e); \ PACKSV4 (s3, v3, e); \ PACKSV4 (s4, v4, e); \ PACKSV4 (s5, v5, e); \ PACKSV4 (s6, v6, e); \ PACKSV4 (s7, v7, e); DECLSPEC void switch_buffer_by_offset_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 switch_buffer_by_offset_le_S (w0, w1, w2, w3, offset); 
#else u32 t0[4]; u32 t1[4]; u32 t2[4]; u32 t3[4]; #endif #if VECT_SIZE == 2 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); #elif VECT_SIZE == 4 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); #elif VECT_SIZE == 8 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, 
t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); #elif VECT_SIZE == 16 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sb); 
PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); switch_buffer_by_offset_le_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f); #endif } DECLSPEC void switch_buffer_by_offset_8x4_le_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], u32x w4[4], u32x w5[4], u32x w6[4], u32x w7[4], const u32x offset) { #if VECT_SIZE == 1 switch_buffer_by_offset_8x4_le_S (w0, w1, w2, w3, w4, w5, w6, w7, offset); #else u32 t0[4]; u32 t1[4]; u32 t2[4]; u32 t3[4]; u32 t4[4]; u32 t5[4]; u32 t6[4]; u32 t7[4]; #endif #if VECT_SIZE == 2 // 1 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); // 2 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); #elif VECT_SIZE == 4 // 1 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); // 2 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, 
w6, w7, 1); // 3 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); // 4 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); #elif VECT_SIZE == 8 // 1 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); // 2 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); // 3 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); // 4 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); // 5 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); // 6 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); // 7 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, 
w2, w3, w4, w5, w6, w7, 6); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s6); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); // 8 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); #elif VECT_SIZE == 16 // 1 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s0); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 0); // 2 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s1); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 1); // 3 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s2); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 2); // 4 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s3); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 3); // 5 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s4); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 4); // 6 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s5); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 5); // 7 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); switch_buffer_by_offset_8x4_le_S (t0, 
t1, t2, t3, t4, t5, t6, t7, offset.s6); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 6); // 8 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s7); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 7); // 9 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s8); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 8); // 10 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.s9); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, 9); // 11 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sa); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, a); // 12 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sb); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, b); // 13 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sc); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, c); // 14 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sd); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, d); // 15 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, e); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.se); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, 
w0, w1, w2, w3, w4, w5, w6, w7, e); // 16 PACKVS84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); switch_buffer_by_offset_8x4_le_S (t0, t1, t2, t3, t4, t5, t6, t7, offset.sf); PACKSV84 (t0, t1, t2, t3, t4, t5, t6, t7, w0, w1, w2, w3, w4, w5, w6, w7, f); #endif } DECLSPEC void append_0x01_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 append_0x01_2x4_S (w0, w1, offset); #else u32 t0[4]; u32 t1[4]; #endif #if VECT_SIZE == 2 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); #elif VECT_SIZE == 4 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); #elif VECT_SIZE == 8 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4); PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5); PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6); PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7); #elif VECT_SIZE == 16 PACKVS24 (t0, t1, w0, w1, 0); append_0x01_2x4_S (t0, t1, offset.s0); 
PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x01_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); PACKVS24 (t0, t1, w0, w1, 2); append_0x01_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); PACKVS24 (t0, t1, w0, w1, 3); append_0x01_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); PACKVS24 (t0, t1, w0, w1, 4); append_0x01_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4); PACKVS24 (t0, t1, w0, w1, 5); append_0x01_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5); PACKVS24 (t0, t1, w0, w1, 6); append_0x01_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6); PACKVS24 (t0, t1, w0, w1, 7); append_0x01_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7); PACKVS24 (t0, t1, w0, w1, 8); append_0x01_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8); PACKVS24 (t0, t1, w0, w1, 9); append_0x01_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9); PACKVS24 (t0, t1, w0, w1, a); append_0x01_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a); PACKVS24 (t0, t1, w0, w1, b); append_0x01_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b); PACKVS24 (t0, t1, w0, w1, c); append_0x01_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c); PACKVS24 (t0, t1, w0, w1, d); append_0x01_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d); PACKVS24 (t0, t1, w0, w1, e); append_0x01_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e); PACKVS24 (t0, t1, w0, w1, f); append_0x01_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f); #endif } DECLSPEC void append_0x80_2x4_VV (u32x w0[4], u32x w1[4], const u32x offset) { #if VECT_SIZE == 1 append_0x80_2x4_S (w0, w1, offset); #else u32 t0[4]; u32 t1[4]; #endif #if VECT_SIZE == 2 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); #elif VECT_SIZE == 4 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); 
PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); #elif VECT_SIZE == 8 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4); PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5); PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6); PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7); #elif VECT_SIZE == 16 PACKVS24 (t0, t1, w0, w1, 0); append_0x80_2x4_S (t0, t1, offset.s0); PACKSV24 (t0, t1, w0, w1, 0); PACKVS24 (t0, t1, w0, w1, 1); append_0x80_2x4_S (t0, t1, offset.s1); PACKSV24 (t0, t1, w0, w1, 1); PACKVS24 (t0, t1, w0, w1, 2); append_0x80_2x4_S (t0, t1, offset.s2); PACKSV24 (t0, t1, w0, w1, 2); PACKVS24 (t0, t1, w0, w1, 3); append_0x80_2x4_S (t0, t1, offset.s3); PACKSV24 (t0, t1, w0, w1, 3); PACKVS24 (t0, t1, w0, w1, 4); append_0x80_2x4_S (t0, t1, offset.s4); PACKSV24 (t0, t1, w0, w1, 4); PACKVS24 (t0, t1, w0, w1, 5); append_0x80_2x4_S (t0, t1, offset.s5); PACKSV24 (t0, t1, w0, w1, 5); PACKVS24 (t0, t1, w0, w1, 6); append_0x80_2x4_S (t0, t1, offset.s6); PACKSV24 (t0, t1, w0, w1, 6); PACKVS24 (t0, t1, w0, w1, 7); append_0x80_2x4_S (t0, t1, offset.s7); PACKSV24 (t0, t1, w0, w1, 7); PACKVS24 (t0, t1, w0, w1, 8); 
append_0x80_2x4_S (t0, t1, offset.s8); PACKSV24 (t0, t1, w0, w1, 8); PACKVS24 (t0, t1, w0, w1, 9); append_0x80_2x4_S (t0, t1, offset.s9); PACKSV24 (t0, t1, w0, w1, 9); PACKVS24 (t0, t1, w0, w1, a); append_0x80_2x4_S (t0, t1, offset.sa); PACKSV24 (t0, t1, w0, w1, a); PACKVS24 (t0, t1, w0, w1, b); append_0x80_2x4_S (t0, t1, offset.sb); PACKSV24 (t0, t1, w0, w1, b); PACKVS24 (t0, t1, w0, w1, c); append_0x80_2x4_S (t0, t1, offset.sc); PACKSV24 (t0, t1, w0, w1, c); PACKVS24 (t0, t1, w0, w1, d); append_0x80_2x4_S (t0, t1, offset.sd); PACKSV24 (t0, t1, w0, w1, d); PACKVS24 (t0, t1, w0, w1, e); append_0x80_2x4_S (t0, t1, offset.se); PACKSV24 (t0, t1, w0, w1, e); PACKVS24 (t0, t1, w0, w1, f); append_0x80_2x4_S (t0, t1, offset.sf); PACKSV24 (t0, t1, w0, w1, f); #endif } DECLSPEC void append_0x80_4x4_VV (u32x w0[4], u32x w1[4], u32x w2[4], u32x w3[4], const u32x offset) { #if VECT_SIZE == 1 append_0x80_4x4_S (w0, w1, w2, w3, offset); #else u32 t0[4]; u32 t1[4]; u32 t2[4]; u32 t3[4]; #endif #if VECT_SIZE == 2 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); #elif VECT_SIZE == 4 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); #elif VECT_SIZE == 8 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, 
t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); #elif VECT_SIZE == 16 PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); append_0x80_4x4_S (t0, t1, t2, t3, offset.s0); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 0); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); append_0x80_4x4_S (t0, t1, t2, t3, offset.s1); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 1); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); append_0x80_4x4_S (t0, t1, t2, t3, offset.s2); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 2); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); append_0x80_4x4_S (t0, t1, t2, t3, offset.s3); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 3); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); append_0x80_4x4_S (t0, t1, t2, t3, offset.s4); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 4); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); append_0x80_4x4_S (t0, t1, t2, t3, offset.s5); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 5); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); append_0x80_4x4_S (t0, t1, t2, t3, offset.s6); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 6); 
PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); append_0x80_4x4_S (t0, t1, t2, t3, offset.s7); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 7); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); append_0x80_4x4_S (t0, t1, t2, t3, offset.s8); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 8); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); append_0x80_4x4_S (t0, t1, t2, t3, offset.s9); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, 9); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, a); append_0x80_4x4_S (t0, t1, t2, t3, offset.sa); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, a); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, b); append_0x80_4x4_S (t0, t1, t2, t3, offset.sb); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, b); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, c); append_0x80_4x4_S (t0, t1, t2, t3, offset.sc); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, c); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, d); append_0x80_4x4_S (t0, t1, t2, t3, offset.sd); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, d); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, e); append_0x80_4x4_S (t0, t1, t2, t3, offset.se); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, e); PACKVS44 (t0, t1, t2, t3, w0, w1, w2, w3, f); append_0x80_4x4_S (t0, t1, t2, t3, offset.sf); PACKSV44 (t0, t1, t2, t3, w0, w1, w2, w3, f); #endif } DECLSPEC void gpu_decompress_entry (__global pw_idx_t *pws_idx, __global u32 *pws_comp, pw_t *pw, const u64 gid) { const u32 off = pws_idx[gid].off; const u32 cnt = pws_idx[gid].cnt; const u32 len = pws_idx[gid].len; #pragma unroll for (u32 i = 0; i < 64; i++) { pw->i[i] = 0; } for (u32 i = 0, j = off; i < cnt; i++, j++) { pw->i[i] = pws_comp[j]; } pw->pw_len = len; } __kernel void gpu_decompress (__global pw_idx_t *pws_idx, __global u32 *pws_comp, __global pw_t *pws_buf, const u64 gid_max) { const u64 gid = get_global_id (0); if (gid >= gid_max) return; pw_t pw; gpu_decompress_entry (pws_idx, pws_comp, &pw, gid); pws_buf[gid] = pw; } __kernel void gpu_memset (__global uint4 *buf, const u32 value, const u64 gid_max) { const u64 
gid = get_global_id (0); if (gid >= gid_max) return; buf[gid] = (uint4) (value); } __kernel void gpu_atinit (__global pw_t *buf, const u64 gid_max) { const u64 gid = get_global_id (0); if (gid >= gid_max) return; const u32 l32 = l32_from_64_S (gid); const u32 h32 = h32_from_64_S (gid); pw_t pw; pw.i[ 0] = 0x5c5c5c5c ^ l32; pw.i[ 1] = 0x36363636 ^ h32; pw.i[ 2] = 0; pw.i[ 3] = 0; pw.i[ 4] = 0; pw.i[ 5] = 0; pw.i[ 6] = 0; pw.i[ 7] = 0; pw.i[ 8] = 0; pw.i[ 9] = 0; pw.i[10] = 0; pw.i[11] = 0; pw.i[12] = 0; pw.i[13] = 0; pw.i[14] = 0; pw.i[15] = 0; pw.i[16] = 0; pw.i[17] = 0; pw.i[18] = 0; pw.i[19] = 0; pw.i[20] = 0; pw.i[21] = 0; pw.i[22] = 0; pw.i[23] = 0; pw.i[24] = 0; pw.i[25] = 0; pw.i[26] = 0; pw.i[27] = 0; pw.i[28] = 0; pw.i[29] = 0; pw.i[30] = 0; pw.i[31] = 0; pw.i[32] = 0; pw.i[33] = 0; pw.i[34] = 0; pw.i[35] = 0; pw.i[36] = 0; pw.i[37] = 0; pw.i[38] = 0; pw.i[39] = 0; pw.i[40] = 0; pw.i[41] = 0; pw.i[42] = 0; pw.i[43] = 0; pw.i[44] = 0; pw.i[45] = 0; pw.i[46] = 0; pw.i[47] = 0; pw.i[48] = 0; pw.i[49] = 0; pw.i[50] = 0; pw.i[51] = 0; pw.i[52] = 0; pw.i[53] = 0; pw.i[54] = 0; pw.i[55] = 0; pw.i[56] = 0; pw.i[57] = 0; pw.i[58] = 0; pw.i[59] = 0; pw.i[60] = 0; pw.i[61] = 0; pw.i[62] = 0; pw.i[63] = 0; // yep that's faster //pw.pw_len = 1 + (l32 & 15); pw.pw_len = 7; // some algorithms are very sensible on this (example: 12500) buf[gid] = pw; }